Merge branch 'lipid_dev_metadat' into 'lipid_dev'

Make small modifications to the lipid metadata generation scripts. See merge request mass-spectrometry/metams!11
microbiomedata · Dec 19, 2024 · b379afa · b379afa
2 parents 259cf3b + 95bc156
commit b379afa
Show file tree

Hide file tree

Showing 3 changed files with 127 additions and 25 deletions.
diff --git a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
@@ -1,7 +1,57 @@
 import requests
-from dataclasses import dataclass
+import json
+import logging
 
-class ApiInfoRetriever:
+class NMDCAPIInterface:
+    """
+    A generic interface for the NMDC runtime API.
+
+    Attributes
+    ----------
+    base_url : str
+        The base URL for the NMDC runtime API.
+
+    Methods
+    -------
+    validate_json(json_path: str) -> None:
+        Validates a json file using the NMDC json validate endpoint.
+    """
+
+    def __init__(self):
+        self.base_url = "https://api.microbiomedata.org"
+
+    def validate_json(self, json_path) -> None:
+        """
+        Validates a json file using the NMDC json validate endpoint.
+
+        The file is loaded, POSTed to ``{base_url}/metadata/json:validate``,
+        and the response is inspected. If the validation passes, the method
+        returns without any side effects.
+
+        Parameters
+        ----------
+        json_path : str
+            The path to the json file to be validated.
+
+        Raises
+        ------
+        Exception
+            If the endpoint responds with a non-200 status code, or if the
+            response body reports validation errors.
+        """
+        # JSON is UTF-8 by specification; do not depend on the locale default.
+        with open(json_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        url = f"{self.base_url}/metadata/json:validate"
+        headers = {
+            'accept': 'application/json',
+            'Content-Type': 'application/json'
+        }
+        response = requests.post(url, headers=headers, json=data)
+        if response.status_code != 200:
+            logging.error(f"Request failed with status code {response.status_code}")
+            logging.error(response.text)
+            raise Exception("Validation failed")
+
+        # NOTE(review): the validate endpoint appears to report schema
+        # problems inside a 200 response as {"result": "errors", "detail": ...},
+        # so the status code alone would let invalid metadata pass. Confirm
+        # the response shape against the NMDC runtime API documentation.
+        try:
+            body = response.json()
+        except ValueError:
+            # Body is not JSON; keep the status-code-only behaviour.
+            return
+        if isinstance(body, dict) and body.get("result") == "errors":
+            logging.error("Validation errors: %s", body.get("detail"))
+            raise Exception("Validation failed")
+
+
+class ApiInfoRetriever(NMDCAPIInterface):
     """
     A class to retrieve API information from a specified collection.
 
@@ -28,6 +78,7 @@ def __init__(self, collection_name: str):
         collection_name : str
             The name of the collection to be used for API queries.
         """
+        super().__init__()
         self.collection_name = collection_name
 
     def get_id_by_name_from_collection(self, name_field_value: str) -> str:
@@ -61,15 +112,53 @@ def get_id_by_name_from_collection(self, name_field_value: str) -> str:
         filter_param = f'{{"name": "{name_field_value}"}}'
         field = "id"
 
-        og_url = f'https://api.microbiomedata.org/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}'
-        
+        og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}"
+
         try:
             resp = requests.get(og_url)
             resp.raise_for_status()  # Raises an HTTPError for bad responses
             data = resp.json()
-            identifier = data['resources'][0]['id']
+            identifier = data["resources"][0]["id"]
             return identifier
         except requests.RequestException as e:
             raise requests.RequestException(f"Error making API request: {e}")
         except (KeyError, IndexError) as e:
             raise IndexError(f"No matching entry found for '{name_field_value}': {e}")
+
+    def check_if_ids_exist(self, ids: list) -> bool:
+        """
+        Check if the IDs exist in the collection.
+
+        This method queries the API for the records in the collection whose
+        `id` is one of the given IDs (projecting only `id`) and compares the
+        number of records returned with the number of distinct IDs requested.
+
+        Parameters
+        ----------
+        ids : list
+            A list of IDs to check if they exist in the collection.
+            Duplicates are ignored.
+
+        Returns
+        -------
+        bool
+            True if all IDs exist in the collection (trivially True for an
+            empty list), False otherwise.
+
+        Raises
+        ------
+        requests.RequestException
+            If there's an error in making the API request.
+        """
+        unique_ids = list(set(ids))
+        if not unique_ids:
+            # Nothing to verify; skip the network round trip.
+            return True
+
+        params = {
+            # json.dumps produces correctly escaped JSON and `params=` lets
+            # requests URL-encode it, so IDs are sent unmodified.
+            "filter": json.dumps({"id": {"$in": unique_ids}}),
+            "projection": "id",
+            # Request every match in one page so a paginated response cannot
+            # truncate the result and make existing IDs look missing.
+            # NOTE(review): confirm `max_page_size` against the NMDC API docs.
+            "max_page_size": len(unique_ids),
+        }
+        url = f"{self.base_url}/nmdcschema/{self.collection_name}"
+
+        try:
+            resp = requests.get(url, params=params)
+            resp.raise_for_status()  # Raises an HTTPError for bad responses
+            data = resp.json()
+        except requests.RequestException as e:
+            raise requests.RequestException(f"Error making API request: {e}") from e
+
+        return len(data["resources"]) == len(unique_ids)
diff --git a/metaMS/nmdc_lipidomics_metadata_generation/example_metadata_file.csv b/metaMS/nmdc_lipidomics_metadata_generation/example_metadata_file.csv
@@ -1,5 +1,5 @@
-Biosample Id,Associated Study,Processing Type,Raw Data File,Raw Data Object Alt Id,Processed Data Directory,mass spec configuration name,lc config name,instrument used,processing institution,instrument analysis start date,instrument analysis end date,execution resource
-nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test1.raw,https://status.my.emsl.pnl.gov/view/1834807,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,15T FT-ICR MS ,EMSL,01/22/2023,03/04/2024,EMSL-RZR
-nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test2.raw,https://status.my.emsl.pnl.gov/view/1834808,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,12T FT-ICR MS ,EMSL,01/22/2023,03/04/2024,EMSL-RZR
-nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test3.raw,https://status.my.emsl.pnl.gov/view/1554801,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,VOrbiETD04,EMSL,01/22/2023,03/04/2024,EMSL-RZR
-nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test4.raw,https://status.my.emsl.pnl.gov/view/1554809,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,Agilent GC-MS (2009),EMSL,01/22/2023,03/04/2024,EMSL-RZR
+Biosample Id,Associated Study,Processing Type,Raw Data File,Processed Data Directory,mass spec configuration name,lc config name,instrument used,processing institution,instrument analysis start date,instrument analysis end date,execution resource
+nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test1.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,15T FT-ICR MS ,EMSL,1/22/23,3/4/24,EMSL-RZR
+nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test2.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,12T FT-ICR MS ,EMSL,1/22/23,3/4/24,EMSL-RZR
+nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test3.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,VOrbiETD04,EMSL,1/22/23,3/4/24,EMSL-RZR
+nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test4.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,Agilent GC-MS (2009),EMSL,1/22/23,3/4/24,EMSL-RZR
diff --git a/metaMS/nmdc_lipidomics_metadata_generation/metadata_generator.py b/metaMS/nmdc_lipidomics_metadata_generation/metadata_generator.py
@@ -15,7 +15,7 @@
 
 import nmdc_schema.nmdc as nmdc
 from linkml_runtime.dumpers import json_dumper
-from api_info_retriever import ApiInfoRetriever
+from api_info_retriever import ApiInfoRetriever, NMDCAPIInterface
 
 # Configure logging
 logging.basicConfig(
@@ -57,8 +57,6 @@ class WorkflowMetadata:
         Directory containing processed data files.
     raw_data_file : str
         Path or name of the raw data file.
-    raw_data_object_alt_id : str
-        Alternative identifier for the raw data object.
     mass_spec_config_name : str
         Name of the mass spectrometry configuration used.
     lc_config_name : str
@@ -74,7 +72,6 @@ class WorkflowMetadata:
     """
     processed_data_dir: str
     raw_data_file: str
-    raw_data_object_alt_id: str
     mass_spec_config_name: str
     lc_config_name: str
     instrument_used: str
@@ -209,7 +206,7 @@ def __init__(
             'processing institution'
         ]
         self.mass_spec_desc = (
-            "Analysis of raw mass spectrometry data for the annotation of lipids."
+            "Generation of mass spectrometry data for the analysis of lipids."
         )
         self.mass_spec_eluent_intro = "liquid_chromatography"
         self.analyte_category = "lipidome"
@@ -302,14 +299,15 @@ def run(self):
                     description=self.raw_data_obj_desc,
                     base_url=self.raw_data_url,
                     was_generated_by=mass_spec.id,
-                    alternative_id=workflow_metadata_obj.raw_data_object_alt_id
                 )
 
                 metab_analysis = self.generate_metabolomics_analysis(
                     cluster_name=workflow_metadata_obj.execution_resource,
                     raw_data_name=Path(workflow_metadata_obj.raw_data_file).name,
+                    raw_data_id=raw_data_object.id,
                     data_gen_id=mass_spec.id,
                     processed_data_id="nmdc:placeholder",
+                    parameter_data_id="nmdc:placeholder",
                     processing_institution=group_metadata_obj.processing_institution
                 )
 
@@ -332,7 +330,7 @@ def run(self):
                                 was_generated_by=metab_analysis.id
                             )
                             nmdc_database_inst.data_object_set.append(processed_data_object)
-                            processed_data.append(processed_data_object.id)
+                            parameter_data_id = processed_data_object.id
 
                         elif file_type == 'csv':
                             processed_data_object = self.generate_data_object(
@@ -369,6 +367,7 @@ def run(self):
                     mass_spec_obj=mass_spec,
                     analysis_obj=metab_analysis,
                     raw_data_obj=raw_data_object,
+                    parameter_data_id=parameter_data_id,
                     processed_data_id_list=processed_data
                 )
 
@@ -377,14 +376,16 @@ def run(self):
                 nmdc_database_inst.workflow_execution_set.append(metab_analysis)
 
         self.dump_nmdc_database(nmdc_database=nmdc_database_inst)
+        api_interface = NMDCAPIInterface()
+        api_interface.validate_json(self.database_dump_json_path)
         logging.info("Metadata processing completed.")
 
     def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy:
         """
         Load and group workflow metadata from a CSV file.
 
         This method reads the metadata CSV file, checks for uniqueness in
-        specified columns, and groups the data by biosample ID.
+        specified columns, checks that biosamples exist, and groups the data by biosample ID.
 
         Returns
         -------
@@ -396,7 +397,7 @@ def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy:
         FileNotFoundError
             If the `metadata_file` does not exist.
         ValueError
-            If values in columns 'Raw Data File', 'Raw Data Object Alt Id',
+            If values in columns 'Raw Data File'
             and 'Processed Data Directory' are not unique.
 
         Notes
@@ -412,12 +413,18 @@ def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy:
         # Check for uniqueness in specified columns
         columns_to_check = [
             'Raw Data File',
-            'Raw Data Object Alt Id',
             'Processed Data Directory'
         ]
         for column in columns_to_check:
             if not metadata_df[column].is_unique:
                 raise ValueError(f"Duplicate values found in column '{column}'.")
+
+        # Check that all biosamples exist
+        biosample_ids = metadata_df['Biosample Id'].unique()
+        api_biosample_getter = ApiInfoRetriever(collection_name="biosample_set")
+
+        if not api_biosample_getter.check_if_ids_exist(biosample_ids):
+            raise ValueError("Biosample IDs do not exist in the collection.")
 
         # Group by Biosample
         grouped = metadata_df.groupby('Biosample Id')
@@ -481,7 +488,6 @@ def create_workflow_metadata(
         return WorkflowMetadata(
             processed_data_dir=row['Processed Data Directory'],
             raw_data_file=row['Raw Data File'],
-            raw_data_object_alt_id=row['Raw Data Object Alt Id'],
             mass_spec_config_name=row['mass spec configuration name'],
             lc_config_name=row['lc config name'],
             instrument_used=row['instrument used'],
@@ -590,8 +596,6 @@ def generate_mass_spectrometry(
         -----
         This method uses the ApiInfoRetriever to fetch IDs for the instrument
         and configurations. It also mints a new NMDC ID for the DataGeneration object.
-
-        TODO: Update docstring with new variables (e.g. analyte_category).
         """
         nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MassSpectrometry)[0]
 
@@ -704,8 +708,10 @@ def generate_metabolomics_analysis(
         self,
         cluster_name: str,
         raw_data_name: str,
+        raw_data_id: str,
         data_gen_id: str,
         processed_data_id: str,
+        parameter_data_id: str,
         processing_institution: str
         ) -> nmdc.MetabolomicsAnalysis:
         """
@@ -720,10 +726,14 @@ def generate_metabolomics_analysis(
             Name of the cluster or computing resource used for the analysis.
         raw_data_name : str
             Name of the raw data file that was analyzed.
+        raw_data_id : str
+            ID of the raw data object that was analyzed.
         data_gen_id : str
             ID of the DataGeneration object that generated the raw data.
         processed_data_id : str
             ID of the processed data resulting from the analysis.
+        parameter_data_id : str
+            ID of the parameter data object used for the analysis.
         processing_institution : str
             Name of the institution where the analysis was performed.
 
@@ -738,7 +748,8 @@ def generate_metabolomics_analysis(
         placeholder values and should be updated with actual timestamps later
         when the processed files are iterated over in the run method.
         """
-        nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MetabolomicsAnalysis)[0]
+        nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MetabolomicsAnalysis)[0]+".1"
+        # TODO: The ".1" suffix is hard-coded; update the minting to handle versioning in the future
 
         data_dict = {
             'id': nmdc_id,
@@ -749,7 +760,7 @@ def generate_metabolomics_analysis(
             'git_url': self.workflow_git_url,
             'version': self.workflow_version,
             'was_informed_by': data_gen_id,
-            'has_input': [raw_data_name],
+            'has_input': [raw_data_id, parameter_data_id],
             'has_output': [processed_data_id],
             'started_at_time': 'placeholder',
             'ended_at_time': 'placeholder',
@@ -765,6 +776,7 @@ def update_outputs(
         mass_spec_obj: object,
         analysis_obj: object,
         raw_data_obj: object,
+        parameter_data_id: str,
         processed_data_id_list: list
         ) -> None:
         """
@@ -798,6 +810,7 @@ def update_outputs(
         - Sets `analysis_obj.has_output` to `processed_data_id_list`.
         """
         mass_spec_obj.has_output = [raw_data_obj.id]
+        analysis_obj.has_input[1] = parameter_data_id
         analysis_obj.has_output = processed_data_id_list
 
     def start_nmdc_database(self) -> nmdc.Database: