diff --git a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
index 6bea42a..3b4f862 100644
--- a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
+++ b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
@@ -1,7 +1,68 @@
 import requests
-from dataclasses import dataclass
+import json
+import logging
 
-class ApiInfoRetriever:
+class NMDCAPIInterface:
+    """
+    A generic interface for the NMDC runtime API.
+
+    Attributes
+    ----------
+    base_url : str
+        The base URL for the NMDC runtime API.
+
+    Methods
+    -------
+    validate_json(json_path: str) -> None:
+        Validates a json file using the NMDC json validate endpoint.
+    """
+
+    def __init__(self):
+        self.base_url = "https://api.microbiomedata.org"
+
+    def validate_json(self, json_path) -> None:
+        """
+        Validates a json file using the NMDC json validate endpoint.
+
+        If the validation passes, the method returns without any side effects.
+
+        Parameters
+        ----------
+        json_path : str
+            The path to the json file to be validated.
+
+        Raises
+        ------
+        Exception
+            If the request fails or the endpoint reports validation errors.
+        """
+        with open(json_path, 'r') as f:
+            data = json.load(f)
+
+        url = f"{self.base_url}/metadata/json:validate"
+        headers = {
+            'accept': 'application/json',
+            'Content-Type': 'application/json'
+        }
+        response = requests.post(url, headers=headers, json=data)
+        if response.status_code != 200:
+            logging.error(f"Request failed with status code {response.status_code}")
+            logging.error(response.text)
+            raise Exception("Validation failed")
+
+        # The validate endpoint appears to answer HTTP 200 even for invalid
+        # documents, reporting {"result": "errors", ...} in the body (confirm in
+        # the NMDC runtime API docs), so the status code alone is not enough.
+        try:
+            result = response.json().get("result")
+        except (ValueError, AttributeError):
+            result = None
+        if result == "errors":
+            logging.error(response.text)
+            raise Exception("Validation failed")
+
+
+class ApiInfoRetriever(NMDCAPIInterface):
     """
     A class to retrieve API information from a specified collection.
 
@@ -28,6 +89,7 @@ def __init__(self, collection_name: str):
         collection_name : str
             The name of the collection to be used for API queries.
         """
+        super().__init__()
         self.collection_name = collection_name
 
     def get_id_by_name_from_collection(self, name_field_value: str) -> str:
@@ -61,15 +123,66 @@ def get_id_by_name_from_collection(self, name_field_value: str) -> str:
         filter_param = f'{{"name": "{name_field_value}"}}'
         field = "id"
 
-        og_url = f'https://api.microbiomedata.org/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}'
-        
+        og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}"
+
         try:
             resp = requests.get(og_url)
             resp.raise_for_status()  # Raises an HTTPError for bad responses
             data = resp.json()
-            identifier = data['resources'][0]['id']
+            identifier = data["resources"][0]["id"]
             return identifier
         except requests.RequestException as e:
             raise requests.RequestException(f"Error making API request: {e}")
         except (KeyError, IndexError) as e:
             raise IndexError(f"No matching entry found for '{name_field_value}': {e}")
+
+    def check_if_ids_exist(self, ids: list) -> bool:
+        """
+        Check if the IDs exist in the collection.
+
+        This method queries the collection for documents whose id is one of the
+        given IDs, and checks that every requested ID was returned.
+
+        Parameters
+        ----------
+        ids : list
+            A list of IDs to check if they exist in the collection. Duplicates are
+            ignored and each ID is compared as a string.
+
+        Returns
+        -------
+        bool
+            True if all IDs exist in the collection, False otherwise. An empty
+            list yields True without querying the API.
+
+        Raises
+        ------
+        requests.RequestException
+            If there's an error in making the API request.
+        """
+        ids_test = {str(one_id) for one_id in ids}
+        if not ids_test:
+            return True
+
+        og_url = f"{self.base_url}/nmdcschema/{self.collection_name}"
+        params = {
+            # json.dumps escapes quotes in an ID instead of rewriting the ID itself.
+            "filter": json.dumps({"id": {"$in": sorted(ids_test)}}),
+            "projection": "id",
+            # The endpoint pages its results (default page size looks to be 20 -
+            # confirm in the NMDC runtime API docs), so request a page big enough
+            # for every ID; otherwise larger ID lists would always look incomplete.
+            "max_page_size": len(ids_test),
+        }
+
+        try:
+            # Passing params lets requests URL-encode the JSON filter.
+            resp = requests.get(og_url, params=params)
+            resp.raise_for_status()  # Raises an HTTPError for bad responses
+            data = resp.json()
+        except requests.RequestException as e:
+            raise requests.RequestException(f"Error making API request: {e}") from e
+
+        # Compare IDs rather than counts so an unexpected extra row cannot mask a miss.
+        found = {str(doc.get("id")) for doc in data.get("resources", [])}
+        return ids_test <= found
diff --git a/metaMS/nmdc_lipidomics_metadata_generation/example_metadata_file.csv b/metaMS/nmdc_lipidomics_metadata_generation/example_metadata_file.csv
index e2524b8..07e2268 100644
--- a/metaMS/nmdc_lipidomics_metadata_generation/example_metadata_file.csv
+++ b/metaMS/nmdc_lipidomics_metadata_generation/example_metadata_file.csv
@@ -1,5 +1,5 @@
-Biosample Id,Associated Study,Processing Type,Raw Data File,Raw Data Object Alt Id,Processed Data Directory,mass spec configuration name,lc config name,instrument used,processing institution,instrument analysis start date,instrument analysis end date,execution resource
-nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test1.raw,https://status.my.emsl.pnl.gov/view/1834807,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,15T FT-ICR MS ,EMSL,01/22/2023,03/04/2024,EMSL-RZR
-nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test2.raw,https://status.my.emsl.pnl.gov/view/1834808,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass
spectrometry method,12T FT-ICR MS ,EMSL,01/22/2023,03/04/2024,EMSL-RZR -nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test3.raw,https://status.my.emsl.pnl.gov/view/1554801,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,VOrbiETD04,EMSL,01/22/2023,03/04/2024,EMSL-RZR -nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test4.raw,https://status.my.emsl.pnl.gov/view/1554809,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,Agilent GC-MS (2009),EMSL,01/22/2023,03/04/2024,EMSL-RZR \ No newline at end of file +Biosample Id,Associated Study,Processing Type,Raw Data File,Processed Data Directory,mass spec configuration name,lc config name,instrument used,processing institution,instrument analysis start date,instrument analysis end date,execution resource +nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test1.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,15T FT-ICR MS ,EMSL,1/22/23,3/4/24,EMSL-RZR +nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test2.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,12T FT-ICR MS ,EMSL,1/22/23,3/4/24,EMSL-RZR 
+nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test3.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,VOrbiETD04,EMSL,1/22/23,3/4/24,EMSL-RZR +nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test4.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,Agilent GC-MS (2009),EMSL,1/22/23,3/4/24,EMSL-RZR \ No newline at end of file diff --git a/metaMS/nmdc_lipidomics_metadata_generation/metadata_generator.py b/metaMS/nmdc_lipidomics_metadata_generation/metadata_generator.py index 67c654c..cd61b26 100644 --- a/metaMS/nmdc_lipidomics_metadata_generation/metadata_generator.py +++ b/metaMS/nmdc_lipidomics_metadata_generation/metadata_generator.py @@ -15,7 +15,7 @@ import nmdc_schema.nmdc as nmdc from linkml_runtime.dumpers import json_dumper -from api_info_retriever import ApiInfoRetriever +from api_info_retriever import ApiInfoRetriever, NMDCAPIInterface # Configure logging logging.basicConfig( @@ -57,8 +57,6 @@ class WorkflowMetadata: Directory containing processed data files. raw_data_file : str Path or name of the raw data file. - raw_data_object_alt_id : str - Alternative identifier for the raw data object. mass_spec_config_name : str Name of the mass spectrometry configuration used. lc_config_name : str @@ -74,7 +72,6 @@ class WorkflowMetadata: """ processed_data_dir: str raw_data_file: str - raw_data_object_alt_id: str mass_spec_config_name: str lc_config_name: str instrument_used: str @@ -209,7 +206,7 @@ def __init__( 'processing institution' ] self.mass_spec_desc = ( - "Analysis of raw mass spectrometry data for the annotation of lipids." 
+ "Generation of mass spectrometry data for the analysis of lipids." ) self.mass_spec_eluent_intro = "liquid_chromatography" self.analyte_category = "lipidome" @@ -302,14 +299,15 @@ def run(self): description=self.raw_data_obj_desc, base_url=self.raw_data_url, was_generated_by=mass_spec.id, - alternative_id=workflow_metadata_obj.raw_data_object_alt_id ) metab_analysis = self.generate_metabolomics_analysis( cluster_name=workflow_metadata_obj.execution_resource, raw_data_name=Path(workflow_metadata_obj.raw_data_file).name, + raw_data_id=raw_data_object.id, data_gen_id=mass_spec.id, processed_data_id="nmdc:placeholder", + parameter_data_id="nmdc:placeholder", processing_institution=group_metadata_obj.processing_institution ) @@ -332,7 +330,7 @@ def run(self): was_generated_by=metab_analysis.id ) nmdc_database_inst.data_object_set.append(processed_data_object) - processed_data.append(processed_data_object.id) + parameter_data_id = processed_data_object.id elif file_type == 'csv': processed_data_object = self.generate_data_object( @@ -369,6 +367,7 @@ def run(self): mass_spec_obj=mass_spec, analysis_obj=metab_analysis, raw_data_obj=raw_data_object, + parameter_data_id=parameter_data_id, processed_data_id_list=processed_data ) @@ -377,6 +376,8 @@ def run(self): nmdc_database_inst.workflow_execution_set.append(metab_analysis) self.dump_nmdc_database(nmdc_database=nmdc_database_inst) + api_interface = NMDCAPIInterface() + api_interface.validate_json(self.database_dump_json_path) logging.info("Metadata processing completed.") def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy: @@ -384,7 +385,7 @@ def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy: Load and group workflow metadata from a CSV file. This method reads the metadata CSV file, checks for uniqueness in - specified columns, and groups the data by biosample ID. + specified columns, checks that biosamples exist, and groups the data by biosample ID. 
Returns ------- @@ -396,7 +397,7 @@ def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy: FileNotFoundError If the `metadata_file` does not exist. ValueError - If values in columns 'Raw Data File', 'Raw Data Object Alt Id', + If values in columns 'Raw Data File', and 'Processed Data Directory' are not unique. Notes @@ -412,12 +413,18 @@ def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy: # Check for uniqueness in specified columns columns_to_check = [ 'Raw Data File', - 'Raw Data Object Alt Id', 'Processed Data Directory' ] for column in columns_to_check: if not metadata_df[column].is_unique: raise ValueError(f"Duplicate values found in column '{column}'.") + + # Check that all biosamples exist + biosample_ids = metadata_df['Biosample Id'].unique() + api_biosample_getter = ApiInfoRetriever(collection_name="biosample_set") + + if not api_biosample_getter.check_if_ids_exist(biosample_ids): + raise ValueError("Biosample IDs do not exist in the collection.") # Group by Biosample grouped = metadata_df.groupby('Biosample Id') @@ -481,7 +488,6 @@ def create_workflow_metadata( return WorkflowMetadata( processed_data_dir=row['Processed Data Directory'], raw_data_file=row['Raw Data File'], - raw_data_object_alt_id=row['Raw Data Object Alt Id'], mass_spec_config_name=row['mass spec configuration name'], lc_config_name=row['lc config name'], instrument_used=row['instrument used'], @@ -590,8 +596,6 @@ def generate_mass_spectrometry( ----- This method uses the ApiInfoRetriever to fetch IDs for the instrument and configurations. It also mints a new NMDC ID for the DataGeneration object. - - TODO: Update docstring with new variables (e.g. analyte_category). 
""" nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MassSpectrometry)[0] @@ -704,8 +708,10 @@ def generate_metabolomics_analysis( self, cluster_name: str, raw_data_name: str, + raw_data_id: str, data_gen_id: str, processed_data_id: str, + parameter_data_id: str, processing_institution: str ) -> nmdc.MetabolomicsAnalysis: """ @@ -720,10 +726,14 @@ def generate_metabolomics_analysis( Name of the cluster or computing resource used for the analysis. raw_data_name : str Name of the raw data file that was analyzed. + raw_data_id : str + ID of the raw data object that was analyzed. data_gen_id : str ID of the DataGeneration object that generated the raw data. processed_data_id : str ID of the processed data resulting from the analysis. + parameter_data_id : str + ID of the parameter data object used for the analysis. processing_institution : str Name of the institution where the analysis was performed. @@ -738,7 +748,8 @@ def generate_metabolomics_analysis( placeholder values and should be updated with actual timestamps later when the processed files are iterated over in the run method. """ - nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MetabolomicsAnalysis)[0] + nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MetabolomicsAnalysis)[0]+".1" + #TODO: Update the minting to handle versioning in the future data_dict = { 'id': nmdc_id, @@ -749,7 +760,7 @@ def generate_metabolomics_analysis( 'git_url': self.workflow_git_url, 'version': self.workflow_version, 'was_informed_by': data_gen_id, - 'has_input': [raw_data_name], + 'has_input': [raw_data_id, parameter_data_id], 'has_output': [processed_data_id], 'started_at_time': 'placeholder', 'ended_at_time': 'placeholder', @@ -765,6 +776,7 @@ def update_outputs( mass_spec_obj: object, analysis_obj: object, raw_data_obj: object, + parameter_data_id: str, processed_data_id_list: list ) -> None: """ @@ -798,6 +810,7 @@ def update_outputs( - Sets `analysis_obj.has_output` to `processed_data_id_list`. 
""" mass_spec_obj.has_output = [raw_data_obj.id] + analysis_obj.has_input[1] = parameter_data_id analysis_obj.has_output = processed_data_id_list def start_nmdc_database(self) -> nmdc.Database: