Commit
refactor: study metadata data model provider refactored; added a default fix_new_lines_in_cells parameter to the ISA table file parser
oyurekten committed Oct 14, 2024
1 parent 46f81a0 commit 723e757
Showing 5 changed files with 817 additions and 35 deletions.
78 changes: 62 additions & 16 deletions metabolights_utils/isatab/default/parser/isa_table_parser.py
@@ -21,29 +21,64 @@
from metabolights_utils.utils.hash_utils import MetabolightsHashUtils as HashUtils


-def fix_empty_rows(
+def fix_isa_table_file(
file_path: str,
messages: List[ParserMessage],
read_encoding: str,
write_encoding: str,
+    fix_empty_rows: bool = True,
+    fix_new_lines_in_cells: bool = True,
+    max_iteration_to_fix_new_lines_in_cells: int = 5,
):
+    if not fix_new_lines_in_cells and not fix_empty_rows:
+        return
+    basename = os.path.basename(file_path)
with open(file_path, "r", encoding=read_encoding) as f:
-        lines = f.readlines()
-    updated_lines = [
-        line.strip("\n").strip("\r") for line in lines if line and line.strip()
-    ]
-    updated_lines = [f"{line}\n" for line in updated_lines]
-    if len(updated_lines) != len(lines):
-        basename = os.path.basename(file_path)
-        messages.append(
-            ParserMessage(
-                type=ParserMessageType.WARNING,
-                short=f"Empty rows are in file: {basename}",
-                detail=f"Empty rows are removed from {basename}",
-            )
-        )
+        file_content = f.read()
+    find_empty_lines = None
+    find_new_lines_in_cells = None
+    if fix_empty_rows:
+        find_empty_lines = re.findall(r"[\r\n][\r\n]+", file_content)
+        if find_empty_lines:
+            messages.append(
+                ParserMessage(
+                    type=ParserMessageType.WARNING,
+                    short=f"Removed empty lines in {basename}.",
+                    detail=f"Removed empty lines in {basename}",
+                )
+            )
+            file_content = re.sub(r"[\r\n][\r\n]+", r"\n", file_content)
+    if fix_new_lines_in_cells:
+        new_line_in_cells_pattern = r'\t"([^\t]*)([\r\n]+)([^\t]*)"\t'
+        find_new_lines_in_cells = re.findall(
+            new_line_in_cells_pattern, file_content
+        )
+
+        if find_new_lines_in_cells:
+            max_iteration = max_iteration_to_fix_new_lines_in_cells
+            iteration = 0
+            while True:
+                new_file_content = re.sub(
+                    new_line_in_cells_pattern, r'\t"\1\3"\t', file_content
+                )
+                if new_file_content == file_content:
+                    break
+                if iteration < 1:
+                    messages.append(
+                        ParserMessage(
+                            type=ParserMessageType.WARNING,
+                            short=f"Removed new line characters in {basename} file table cells.",
+                            detail=f"Removed new line characters in {basename} file table cells.",
+                        )
+                    )
+                iteration += 1
+                if iteration > max_iteration:
+                    break
+                file_content = new_file_content
+
+    if find_empty_lines or find_new_lines_in_cells:
        with open(file_path, "w", encoding=write_encoding) as f:
-            f.writelines(updated_lines)
+            f.write(file_content)
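
A standalone sketch (not part of the commit) of how the two regular expressions above behave, using an invented two-row TSV fragment; running it prints the cleaned content:

import re

# Invented fragment: one quoted cell is split by a newline, and two empty
# lines separate the rows.
content = 'Sample Name\t"a broken\ncell"\tProtocol REF\n\n\ns1\t"fine"\tExtraction\n'

# Empty-row fix: collapse runs of consecutive line breaks into one. A single
# newline inside a quoted cell is not touched, since the pattern needs two
# or more break characters.
content = re.sub(r"[\r\n][\r\n]+", r"\n", content)

# Cell fix: rejoin quoted cells split across lines, iterating to a fixed
# point exactly as the loop above does (one pass suffices here).
pattern = r'\t"([^\t]*)([\r\n]+)([^\t]*)"\t'
while True:
    fixed = re.sub(pattern, r'\t"\1\3"\t', content)
    if fixed == content:
        break
    content = fixed

print(content)
# Sample Name\t"a brokencell"\tProtocol REF
# s1\t"fine"\tExtraction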


@@ -52,18 +87,27 @@ def parse_isa_file_content(
messages: List[ParserMessage],
fix_unicode_exceptions: bool = False,
remove_empty_rows: bool = False,
+    remove_new_lines_in_cells: bool = False,
) -> Tuple[IsaTableFile, List[ParserMessage]]:
try:
if remove_empty_rows:
fix_empty_rows(file_path, messages, "utf-8", "utf-8")
fix_isa_table_file(file_path, messages, "utf-8", "utf-8")
with open(file_path, "r", encoding="utf-8") as f:
model = parser(f, messages=messages)
return model, messages
except UnicodeDecodeError as ex:
if fix_unicode_exceptions:
try:
if remove_empty_rows:
fix_empty_rows(file_path, messages, "latin-1", "latin-1")
fix_isa_table_file(
file_path,
messages,
"latin-1",
"latin-1",
remove_empty_rows,
remove_new_lines_in_cells,
max_iteration_to_fix_new_lines_in_cells=5,
)
with open(file_path, "r", encoding="latin-1") as f:
model = parser(f, messages=messages)
message = ParserMessage(
@@ -106,6 +150,7 @@ def parse_isa_table_sheet_from_fs(
sort_options: List[TsvFileSortOption] = None,
fix_unicode_exceptions: bool = False,
remove_empty_rows: bool = False,
+    remove_new_lines_in_cells: bool = False
) -> Tuple[IsaTableFile, List[ParserMessage]]:
basename = os.path.basename(file_path)
dirname = os.path.basename(os.path.dirname(file_path))
@@ -154,6 +199,7 @@ def parse_isa_table_sheet_from_fs(
messages=read_messages,
fix_unicode_exceptions=fix_unicode_exceptions,
remove_empty_rows=remove_empty_rows,
+        remove_new_lines_in_cells=remove_new_lines_in_cells
)
isa_table_file: IsaTableFile = table
if isa_table_file:
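A hypothetical call site (the path is invented) showing how the new flag flows from parse_isa_table_sheet_from_fs down into fix_isa_table_file; both cleanup flags default to False, so existing callers keep their current behavior:

table_file, messages = parse_isa_table_sheet_from_fs(
    "/studies/MTBLS1/s_MTBLS1.txt",  # invented path
    fix_unicode_exceptions=True,
    remove_empty_rows=True,
    remove_new_lines_in_cells=True,  # new parameter in this commit
)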
119 changes: 100 additions & 19 deletions metabolights_utils/provider/study_provider.py
@@ -1,6 +1,8 @@
import logging
import os

from abc import ABC, abstractmethod
+from pathlib import Path
from typing import List, Set, Tuple, Union

from metabolights_utils.isatab.default.parser.investigation_parser import (
@@ -40,6 +42,42 @@
logger = logging.getLogger(__name__)


+class AbstractMetadataFileProvider(ABC):
+
+    @abstractmethod
+    def get_study_metadata_path(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> str:
+        pass
+
+    @abstractmethod
+    def exists(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> bool:
+        pass
+
+
+class DefaultStudyMetadataFileProvider(AbstractMetadataFileProvider):
+    def __init__(self, study_metadata_root_path: str):
+        self.study_metadata_root_path = study_metadata_root_path
+
+    def get_study_metadata_path(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> str:
+        if file_relative_path:
+            return os.path.join(
+                self.study_metadata_root_path, study_id, file_relative_path
+            )
+        else:
+            return os.path.join(self.study_metadata_root_path, study_id)
+
+    def exists(
+        self, study_id: str, file_relative_path: Union[None, str] = None
+    ) -> bool:
+        file_path = Path(self.get_study_metadata_path(study_id, file_relative_path))
+        return file_path.resolve().exists()
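
A minimal usage sketch of the new provider, assuming a local studies root; the root path and accession are illustrative:

provider = DefaultStudyMetadataFileProvider("/data/studies")

provider.get_study_metadata_path("MTBLS1")
# -> '/data/studies/MTBLS1'
provider.get_study_metadata_path("MTBLS1", "i_Investigation.txt")
# -> '/data/studies/MTBLS1/i_Investigation.txt'
provider.exists("MTBLS1", "i_Investigation.txt")
# -> True only if that file actually exists under the root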


class AbstractDbMetadataCollector(ABC):

@abstractmethod
@@ -90,9 +128,11 @@ def __init__(
self,
db_metadata_collector: Union[None, AbstractDbMetadataCollector] = None,
folder_metadata_collector: Union[None, AbstractFolderMetadataCollector] = None,
+        metadata_file_provider: Union[None, AbstractMetadataFileProvider] = None,
) -> None:
self.db_metadata_collector = db_metadata_collector
self.folder_metadata_collector = folder_metadata_collector
+        self.metadata_file_provider = metadata_file_provider

def _add_parse_messages(
self,
@@ -175,6 +215,36 @@ def set_organisms(self, samples_file: SamplesFile, isa_table: SamplesFile):
)
samples_file.organism_and_organism_part_pairs = list(pairs)

+    def get_file_path(
+        self,
+        relative_file_path: str,
+        folder: Union[None, str],
+        study_id: Union[None, str],
+    ):
+        if not folder:
+            if not self.metadata_file_provider:
+                raise ValueError("Define metadata file provider if folder is None.")
+            file_path = self.metadata_file_provider.get_study_metadata_path(
+                study_id, relative_file_path
+            )
+        else:
+            file_path = os.path.join(folder, relative_file_path)
+        return file_path
+
+    def get_study_metadata_path(
+        self,
+        folder: Union[None, str],
+        study_id: Union[None, str],
+    ) -> Tuple[str, bool]:
+        if not folder:
+            study_path = self.metadata_file_provider.get_study_metadata_path(study_id)
+            exist = self.metadata_file_provider.exists(study_path)
+        else:
+            study_path = folder
+            real_path = os.path.realpath(folder)
+            exist = os.path.exists(real_path)
+        return study_path, exist
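
The same resolution rule in isolation (a sketch; study_provider stands for an instance of the enclosing provider class, which is not named in this hunk): with a folder the path is a plain join, without one the call is delegated to metadata_file_provider, and a ValueError is raised if no provider was configured.

# folder given: the metadata file provider is bypassed
study_provider.get_file_path("s_MTBLS1.txt", "/tmp/MTBLS1", "MTBLS1")
# -> '/tmp/MTBLS1/s_MTBLS1.txt'

# folder is None: resolved through metadata_file_provider
study_provider.get_file_path("s_MTBLS1.txt", None, "MTBLS1")
# -> '<study_metadata_root_path>/MTBLS1/s_MTBLS1.txt'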

def get_phase1_input_data(
self,
study_id: str,
@@ -183,7 +253,7 @@ def get_phase1_input_data(
) -> MetabolightsStudyModel:
model: MetabolightsStudyModel = MetabolightsStudyModel()
logger.debug("Load i_Investigation.txt file on %s for %s", folder, study_id)
-        self.update_investigation_file(model, folder)
+        self.update_investigation_file(model, folder, study_id=study_id)
if self.db_metadata_collector and connection:
logger.debug("Load %s study database metadata.", study_id)
self.update_study_db_metadata(
@@ -213,7 +283,9 @@ def get_phase1_input_data(
folders_in_hierarchy = set()
investigation = model.investigation
for study_item in investigation.studies:
-            file_path = os.path.join(folder, study_item.file_name)
+
+            file_path = self.get_file_path(study_item.file_name, folder, study_id)

logger.debug("Load sample file headers %s", study_item.file_name)
samples_isa_table, messages = parse_isa_table_sheet_from_fs(
file_path,
@@ -257,7 +329,8 @@ def get_phase1_input_data(

model.samples[study_item.file_name] = samples_file
for assay_item in study_item.study_assays.assays:
-                file_path = os.path.join(folder, assay_item.file_name)
+                file_path = self.get_file_path(assay_item.file_name, folder, study_id)

logger.debug(
"Load %s assay file headers for %s.", assay_item.file_name, study_id
)
@@ -395,7 +468,7 @@ def get_phase1_input_data(
model.folders_in_hierarchy.extend(list(folders_in_hierarchy))

for assignment_file in assignment_files:
-            absolute_path = os.path.join(folder, assignment_file)
+            absolute_path = self.get_file_path(assignment_file, folder, study_id)
logger.debug(
"Load %s assignment file headers for %s.", assignment_file, study_id
)
@@ -487,7 +560,7 @@ def get_sample_file_input(
model = self.get_phase1_input_data(study_id, folder, connection)

for study_item in model.investigation.studies:
-            file_path = os.path.join(folder, study_item.file_name)
+            file_path = self.get_file_path(study_item.file_name, folder, study_id)
samples_isa_table_sheet, messages = parse_isa_table_sheet_from_fs(
file_path,
samples_file_expected_patterns,
@@ -526,7 +599,8 @@ def get_phase2_input_data(
model = self.get_phase1_input_data(study_id, folder, connection)

for study_item in model.investigation.studies:
-            file_path = os.path.join(folder, study_item.file_name)
+            file_path = self.get_file_path(study_item.file_name, folder, study_id)

samples_isa_table_sheet, messages = parse_isa_table_sheet_from_fs(
file_path,
samples_file_expected_patterns,
@@ -551,7 +625,8 @@ def get_phase2_input_data(
self.filter_messages(messages)
)
for assay_item in study_item.study_assays.assays:
-                file_path = os.path.join(folder, assay_item.file_name)
+                file_path = self.get_file_path(assay_item.file_name, folder, study_id)

assay_isa_table_sheet, messages = parse_isa_table_sheet_from_fs(
file_path,
assay_file_expected_patterns,
@@ -596,19 +671,17 @@ def get_phase3_input_data(
model = self.get_phase1_input_data(study_id, folder, connection)

for assignment_file in model.metabolite_assignments:
-            absolute_path = os.path.join(folder, assignment_file)
+            absolute_path = self.get_file_path(assignment_file, folder, study_id)
(
-                metabolite_assignment_isa_table_sheet,
+                maf_isa_table_sheet,
messages,
) = parse_isa_table_sheet_from_fs(
absolute_path,
offset=assignment_sheet_offset,
limit=assignment_sheet_limit,
fix_unicode_exceptions=True,
)
-            metabolite_assignment_isa_table: IsaTableFile = (
-                metabolite_assignment_isa_table_sheet
-            )
+            metabolite_assignment_isa_table: IsaTableFile = maf_isa_table_sheet
model.parser_messages[assignment_file].extend(
self.filter_messages(messages)
)
@@ -722,13 +795,17 @@ def load_study(
calculate_data_folder_size: bool = False,
calculate_metadata_size: bool = False,
) -> MetabolightsStudyModel:
-        if not study_id or not study_path:
-            raise ValueError("invalid study_id or study_path")
-        real_path = os.path.realpath(study_path)
-        if not os.path.exists(real_path):
+        if not study_id:
+            raise ValueError("invalid study_id")
+        exist = False
+        study_path, exist = self.get_study_metadata_path(study_path, study_id)
+
+        if not study_path:
+            raise ValueError("invalid study_path")
+        if not exist:
model = MetabolightsStudyModel()
message = CriticalMessage(
short=f"Study folder does not exist for {study_id}"
short=f"Study folder does not exist for {study_id} {study_path}"
)
model.folder_reader_messages.append(message)
return model
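
With a metadata file provider configured, load_study can now resolve the study folder from the accession alone; a hedged sketch (study_provider again stands for an instance of the enclosing class, constructed with metadata_file_provider set):

file_provider = DefaultStudyMetadataFileProvider("/data/studies")  # invented root
model = study_provider.load_study("MTBLS1", study_path=None)
# study_path is resolved to '/data/studies/MTBLS1' via the provider; if that
# folder does not exist, a CriticalMessage is recorded and an empty model
# is returned instead of raising.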
@@ -786,9 +863,13 @@ def load_study(
return model

def update_investigation_file(
-        self, model: MetabolightsStudyModel, folder, file_name="i_Investigation.txt"
+        self,
+        model: MetabolightsStudyModel,
+        folder,
+        file_name="i_Investigation.txt",
+        study_id: Union[None, str] = None,
):
-        file = os.path.join(folder, file_name)
+        file = self.get_file_path(file_name, folder, study_id)
investigation, messages = parse_investigation_from_fs(
file, fix_unicode_exceptions=True
)