Commit
refactor: study metadata data model provider refactored; added a default fix_new_lines_in_cells parameter to the ISA table file parser
oyurekten committed Oct 14, 2024
1 parent 46f81a0 commit 723e757
Showing 5 changed files with 817 additions and 35 deletions.
78 changes: 62 additions & 16 deletions metabolights_utils/isatab/default/parser/isa_table_parser.py
@@ -21,29 +21,64 @@
from metabolights_utils.utils.hash_utils import MetabolightsHashUtils as HashUtils


-def fix_empty_rows(
+def fix_isa_table_file(
file_path: str,
messages: List[ParserMessage],
read_encoding: str,
write_encoding: str,
+    fix_empty_rows: bool = True,
+    fix_new_lines_in_cells: bool = True,
+    max_iteration_to_fix_new_lines_in_cells: int = 5,
):
+    if not fix_new_lines_in_cells and not fix_empty_rows:
+        return
+    basename = os.path.basename(file_path)
with open(file_path, "r", encoding=read_encoding) as f:
-        lines = f.readlines()
-    updated_lines = [
-        line.strip("\n").strip("\r") for line in lines if line and line.strip()
-    ]
-    updated_lines = [f"{line}\n" for line in updated_lines]
-    if len(updated_lines) != len(lines):
-        basename = os.path.basename(file_path)
-        messages.append(
-            ParserMessage(
-                type=ParserMessageType.WARNING,
-                short=f"Empty rows are in file: {basename}",
-                detail=f"Empty rows are removed from {basename}",
-            )
-        )
+        file_content = f.read()
+    find_empty_lines = None
+    find_new_lines_in_cells = None
+    if fix_empty_rows:
+        find_empty_lines = re.findall(r"[\r\n][\r\n]+", file_content)
+        if find_empty_lines:
+            messages.append(
+                ParserMessage(
+                    type=ParserMessageType.WARNING,
+                    short=f"Removed empty lines in {basename}.",
+                    detail=f"Removed empty lines in {basename}",
+                )
+            )
+            file_content = re.sub(r"[\r\n][\r\n]+", r"\n", file_content)
+    if fix_new_lines_in_cells:
+        new_line_in_cells_pattern = r'\t"([^\t]*)([\r\n]+)([^\t]*)"\t'
+        find_new_lines_in_cells = re.findall(
+            new_line_in_cells_pattern, file_content
+        )
+
+        if find_new_lines_in_cells:
+            max_iteration = max_iteration_to_fix_new_lines_in_cells
+            iteration = 0
+            while True:
+                new_file_content = re.sub(
+                    new_line_in_cells_pattern, r'\t"\1\3"\t', file_content
+                )
+                if new_file_content == file_content:
+                    break
+                if iteration < 1:
+                    messages.append(
+                        ParserMessage(
+                            type=ParserMessageType.WARNING,
+                            short=f"Removed new line characters in {basename} file table cells.",
+                            detail=f"Removed new line characters in {basename} file table cells.",
+                        )
+                    )
+                iteration += 1
+                if iteration > max_iteration:
+                    break
+                file_content = new_file_content
+
+    if find_empty_lines or find_new_lines_in_cells:
        with open(file_path, "w", encoding=write_encoding) as f:
-            f.writelines(updated_lines)
+            f.write(file_content)
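
A standalone sketch (not part of the commit) of how the two regular expressions above behave, using an invented two-row TSV fragment; running it prints the cleaned content:

import re

# Invented fragment: one quoted cell is split by a newline, and two empty
# lines separate the rows.
content = 'Sample Name\t"a broken\ncell"\tProtocol REF\n\n\ns1\t"fine"\tExtraction\n'

# Empty-row fix: collapse runs of consecutive line breaks into one. A single
# newline inside a quoted cell is not touched, since the pattern needs two
# or more break characters.
content = re.sub(r"[\r\n][\r\n]+", r"\n", content)

# Cell fix: rejoin quoted cells split across lines, iterating to a fixed
# point exactly as the loop above does (one pass suffices here).
pattern = r'\t"([^\t]*)([\r\n]+)([^\t]*)"\t'
while True:
    fixed = re.sub(pattern, r'\t"\1\3"\t', content)
    if fixed == content:
        break
    content = fixed

print(content)
# Sample Name\t"a brokencell"\tProtocol REF
# s1\t"fine"\tExtraction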


@@ -52,18 +87,27 @@ def parse_isa_file_content(
messages: List[ParserMessage],
fix_unicode_exceptions: bool = False,
remove_empty_rows: bool = False,
+    remove_new_lines_in_cells: bool = False,
) -> Tuple[IsaTableFile, List[ParserMessage]]:
try:
if remove_empty_rows:
fix_empty_rows(file_path, messages, "utf-8", "utf-8")
fix_isa_table_file(file_path, messages, "utf-8", "utf-8")
with open(file_path, "r", encoding="utf-8") as f:
model = parser(f, messages=messages)
return model, messages
except UnicodeDecodeError as ex:
if fix_unicode_exceptions:
try:
if remove_empty_rows:
fix_empty_rows(file_path, messages, "latin-1", "latin-1")
fix_isa_table_file(
file_path,
messages,
"latin-1",
"latin-1",
remove_empty_rows,
remove_new_lines_in_cells,
max_iteration_to_fix_new_lines_in_cells=5,
)
with open(file_path, "r", encoding="latin-1") as f:
model = parser(f, messages=messages)
message = ParserMessage(
@@ -106,6 +150,7 @@ def parse_isa_table_sheet_from_fs(
sort_options: List[TsvFileSortOption] = None,
fix_unicode_exceptions: bool = False,
remove_empty_rows: bool = False,
+    remove_new_lines_in_cells: bool = False
) -> Tuple[IsaTableFile, List[ParserMessage]]:
basename = os.path.basename(file_path)
dirname = os.path.basename(os.path.dirname(file_path))
@@ -154,6 +199,7 @@ def parse_isa_table_sheet_from_fs(
messages=read_messages,
fix_unicode_exceptions=fix_unicode_exceptions,
remove_empty_rows=remove_empty_rows,
+        remove_new_lines_in_cells=remove_new_lines_in_cells
)
isa_table_file: IsaTableFile = table
if isa_table_file:
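A hypothetical call site (the path is invented) showing how the new flag flows from parse_isa_table_sheet_from_fs down into fix_isa_table_file; both cleanup flags default to False, so existing callers keep their current behavior:

table_file, messages = parse_isa_table_sheet_from_fs(
    "/studies/MTBLS1/s_MTBLS1.txt",  # invented path
    fix_unicode_exceptions=True,
    remove_empty_rows=True,
    remove_new_lines_in_cells=True,  # new parameter in this commit
)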
119 changes: 100 additions & 19 deletions metabolights_utils/provider/study_provider.py
@@ -1,6 +1,8 @@
import logging
import os

from abc import ABC, abstractmethod
+from pathlib import Path
from typing import List, Set, Tuple, Union

from metabolights_utils.isatab.default.parser.investigation_parser import (
@@ -40,6 +42,42 @@
logger = logging.getLogger(__name__)


+class AbstractMetadataFileProvider(ABC):
+
+    @abstractmethod
+    def get_study_metadata_path(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> str:
+        pass
+
+    @abstractmethod
+    def exists(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> bool:
+        pass
+
+
+class DefaultStudyMetadataFileProvider(AbstractMetadataFileProvider):
+    def __init__(self, study_metadata_root_path: str):
+        self.study_metadata_root_path = study_metadata_root_path
+
+    def get_study_metadata_path(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> str:
+        if file_relative_path:
+            return os.path.join(
+                self.study_metadata_root_path, study_id, file_relative_path
+            )
+        else:
+            return os.path.join(self.study_metadata_root_path, study_id)
+
+    def exists(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> bool:
+        file_path = Path(self.get_study_metadata_path(study_id, file_relative_path))
+        return file_path.resolve().exists()
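
A minimal usage sketch of the new provider, assuming a local studies root; the root path and accession are illustrative:

provider = DefaultStudyMetadataFileProvider("/data/studies")

provider.get_study_metadata_path("MTBLS1")
# -> '/data/studies/MTBLS1'
provider.get_study_metadata_path("MTBLS1", "i_Investigation.txt")
# -> '/data/studies/MTBLS1/i_Investigation.txt'
provider.exists("MTBLS1", "i_Investigation.txt")
# -> True only if that file actually exists under the root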


class AbstractDbMetadataCollector(ABC):

@abstractmethod
@@ -90,9 +128,11 @@ def __init__(
self,
db_metadata_collector: Union[None, AbstractDbMetadataCollector] = None,
folder_metadata_collector: Union[None, AbstractFolderMetadataCollector] = None,
+        metadata_file_provider: Union[None, AbstractMetadataFileProvider] = None,
) -> None:
self.db_metadata_collector = db_metadata_collector
self.folder_metadata_collector = folder_metadata_collector
+        self.metadata_file_provider = metadata_file_provider

def _add_parse_messages(
self,
@@ -175,6 +215,36 @@ def set_organisms(self, samples_file: SamplesFile, isa_table: SamplesFile):
)
samples_file.organism_and_organism_part_pairs = list(pairs)

+    def get_file_path(
+        self,
+        relative_file_path: str,
+        folder: Union[None, str],
+        study_id: Union[None, str],
+    ):
+        if not folder:
+            if not self.metadata_file_provider:
+                raise ValueError("Define metadata file provider if folder is None.")
+            file_path = self.metadata_file_provider.get_study_metadata_path(
+                study_id, relative_file_path
+            )
+        else:
+            file_path = os.path.join(folder, relative_file_path)
+        return file_path
+
+    def get_study_metadata_path(
+        self,
+        folder: Union[None, str],
+        study_id: Union[None, str],
+    ) -> Tuple[str, bool]:
+        if not folder:
+            study_path = self.metadata_file_provider.get_study_metadata_path(study_id)
+            exist = self.metadata_file_provider.exists(study_path)
+        else:
+            study_path = folder
+            real_path = os.path.realpath(folder)
+            exist = os.path.exists(real_path)
+        return study_path, exist
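
The same resolution rule in isolation (a sketch; study_provider stands for an instance of the enclosing provider class, which is not named in this hunk): with a folder the path is a plain join, without one the call is delegated to metadata_file_provider, and a ValueError is raised if no provider was configured.

# folder given: the metadata file provider is bypassed
study_provider.get_file_path("s_MTBLS1.txt", "/tmp/MTBLS1", "MTBLS1")
# -> '/tmp/MTBLS1/s_MTBLS1.txt'

# folder is None: resolved through metadata_file_provider
study_provider.get_file_path("s_MTBLS1.txt", None, "MTBLS1")
# -> '<study_metadata_root_path>/MTBLS1/s_MTBLS1.txt'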

def get_phase1_input_data(
self,
study_id: str,
@@ -183,7 +253,7 @@ def get_phase1_input_data(
) -> MetabolightsStudyModel:
model: MetabolightsStudyModel = MetabolightsStudyModel()
logger.debug("Load i_Investigation.txt file on %s for %s", folder, study_id)
-        self.update_investigation_file(model, folder)
+        self.update_investigation_file(model, folder, study_id=study_id)
if self.db_metadata_collector and connection:
logger.debug("Load %s study database metadata.", study_id)
self.update_study_db_metadata(
@@ -213,7 +283,9 @@ def get_phase1_input_data(
folders_in_hierarchy = set()
investigation = model.investigation
for study_item in investigation.studies:
-            file_path = os.path.join(folder, study_item.file_name)
+
+            file_path = self.get_file_path(study_item.file_name, folder, study_id)

logger.debug("Load sample file headers %s", study_item.file_name)
samples_isa_table, messages = parse_isa_table_sheet_from_fs(
file_path,
@@ -257,7 +329,8 @@ def get_phase1_input_data(

model.samples[study_item.file_name] = samples_file
for assay_item in study_item.study_assays.assays:
-                file_path = os.path.join(folder, assay_item.file_name)
+                file_path = self.get_file_path(assay_item.file_name, folder, study_id)

logger.debug(
"Load %s assay file headers for %s.", assay_item.file_name, study_id
)
@@ -395,7 +468,7 @@ def get_phase1_input_data(
model.folders_in_hierarchy.extend(list(folders_in_hierarchy))

for assignment_file in assignment_files:
-            absolute_path = os.path.join(folder, assignment_file)
+            absolute_path = self.get_file_path(assignment_file, folder, study_id)
logger.debug(
"Load %s assignment file headers for %s.", assignment_file, study_id
)
@@ -487,7 +560,7 @@ def get_sample_file_input(
model = self.get_phase1_input_data(study_id, folder, connection)

for study_item in model.investigation.studies:
-            file_path = os.path.join(folder, study_item.file_name)
+            file_path = self.get_file_path(study_item.file_name, folder, study_id)
samples_isa_table_sheet, messages = parse_isa_table_sheet_from_fs(
file_path,
samples_file_expected_patterns,
@@ -526,7 +599,8 @@ def get_phase2_input_data(
model = self.get_phase1_input_data(study_id, folder, connection)

for study_item in model.investigation.studies:
-            file_path = os.path.join(folder, study_item.file_name)
+            file_path = self.get_file_path(study_item.file_name, folder, study_id)

samples_isa_table_sheet, messages = parse_isa_table_sheet_from_fs(
file_path,
samples_file_expected_patterns,
@@ -551,7 +625,8 @@ def get_phase2_input_data(
self.filter_messages(messages)
)
for assay_item in study_item.study_assays.assays:
-                file_path = os.path.join(folder, assay_item.file_name)
+                file_path = self.get_file_path(assay_item.file_name, folder, study_id)

assay_isa_table_sheet, messages = parse_isa_table_sheet_from_fs(
file_path,
assay_file_expected_patterns,
@@ -596,19 +671,17 @@ def get_phase3_input_data(
model = self.get_phase1_input_data(study_id, folder, connection)

for assignment_file in model.metabolite_assignments:
-            absolute_path = os.path.join(folder, assignment_file)
+            absolute_path = self.get_file_path(assignment_file, folder, study_id)
(
-                metabolite_assignment_isa_table_sheet,
+                maf_isa_table_sheet,
messages,
) = parse_isa_table_sheet_from_fs(
absolute_path,
offset=assignment_sheet_offset,
limit=assignment_sheet_limit,
fix_unicode_exceptions=True,
)
-            metabolite_assignment_isa_table: IsaTableFile = (
-                metabolite_assignment_isa_table_sheet
-            )
+            metabolite_assignment_isa_table: IsaTableFile = maf_isa_table_sheet
model.parser_messages[assignment_file].extend(
self.filter_messages(messages)
)
@@ -722,13 +795,17 @@ def load_study(
calculate_data_folder_size: bool = False,
calculate_metadata_size: bool = False,
) -> MetabolightsStudyModel:
-        if not study_id or not study_path:
-            raise ValueError("invalid study_id or study_path")
-        real_path = os.path.realpath(study_path)
-        if not os.path.exists(real_path):
+        if not study_id:
+            raise ValueError("invalid study_id")
+        exist = False
+        study_path, exist = self.get_study_metadata_path(study_path, study_id)
+
+        if not study_path:
+            raise ValueError("invalid study_path")
+        if not exist:
model = MetabolightsStudyModel()
message = CriticalMessage(
short=f"Study folder does not exist for {study_id}"
short=f"Study folder does not exist for {study_id} {study_path}"
)
model.folder_reader_messages.append(message)
return model
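
With a metadata file provider configured, load_study can now resolve the study folder from the accession alone; a hedged sketch (study_provider again stands for an instance of the enclosing class, constructed with metadata_file_provider set):

file_provider = DefaultStudyMetadataFileProvider("/data/studies")  # invented root
model = study_provider.load_study("MTBLS1", study_path=None)
# study_path is resolved to '/data/studies/MTBLS1' via the provider; if that
# folder does not exist, a CriticalMessage is recorded and an empty model
# is returned instead of raising.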
@@ -786,9 +863,13 @@ def load_study(
return model

def update_investigation_file(
-        self, model: MetabolightsStudyModel, folder, file_name="i_Investigation.txt"
+        self,
+        model: MetabolightsStudyModel,
+        folder,
+        file_name="i_Investigation.txt",
+        study_id: Union[None, str] = None,
):
-        file = os.path.join(folder, file_name)
+        file = self.get_file_path(file_name, folder, study_id)
investigation, messages = parse_investigation_from_fs(
file, fix_unicode_exceptions=True
)