Skip to content

Commit

Permalink
Merge pull request #308 from helxplatform/gapexchange-study-name
Browse files Browse the repository at this point in the history
Try to pull study name from GapExchange file first
  • Loading branch information
YaphetKG authored Aug 14, 2023
2 parents c1b348e + 58548cc commit 08c1d79
Show file tree
Hide file tree
Showing 5 changed files with 428 additions and 6 deletions.
33 changes: 28 additions & 5 deletions src/dug/core/parsers/dbgap_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import os
import re
from pathlib import Path
from typing import List, Optional
from xml.etree import ElementTree as ET

from dug import utils as utils

from ._base import DugElement, FileParser, Indexable, InputFile

logger = logging.getLogger('dug')
Expand All @@ -13,27 +14,49 @@ class DbGaPParser(FileParser):
# Class for parsing dbGaP data dictionaries into a set of Dug Elements

@staticmethod
def parse_study_name_from_filename(filename: str):
def parse_study_name_from_filename(filename: str) -> str:
# Parse the study name from the xml filename if it exists. Return None if filename isn't right format to get id from
dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]+\.pht[0-9]+\.v[0-9]+\.(.+)\.data_dict.*')
match = re.match(dbgap_file_pattern, filename)
if match is not None:
return match.group(1)
return None

@staticmethod
def parse_study_name_from_gap_exchange_file(filepath: Path) -> str:
# Parse the study name from the GapExchange file adjacent to the file passed in
parent_dir = filepath.parent.absolute()
gap_exchange_filename_str = "GapExchange_" + parent_dir.name
gap_exchange_filepath = None
for item in os.scandir(parent_dir):
if item.is_file and gap_exchange_filename_str in item.name:
gap_exchange_filepath = item.path
if gap_exchange_filepath is None:
return None
tree = ET.parse(gap_exchange_filepath, ET.XMLParser(encoding='iso-8859-5'))
tree_root = tree.getroot()
return tree_root.find("./Studies/Study/Configuration/StudyNameEntrez").text


def _get_element_type(self):
return "DbGaP"

def __call__(self, input_file: InputFile) -> List[Indexable]:
logger.debug(input_file)
if "GapExchange" in str(input_file).split("/")[-1]:
msg = f"Skipping parsing for GapExchange file: {input_file}!"
logger.info(msg)
return []
tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5'))
root = tree.getroot()
study_id = root.attrib['study_id']
participant_set = root.get('participant_set','0')

# Parse study name from file handle
study_name = self.parse_study_name_from_filename(str(input_file))

# Parse study name from GapExchange file, and if that fails try from file handle
# If still None, raise an error message
study_name = self.parse_study_name_from_gap_exchange_file(Path(input_file))
if study_name is None:
study_name = self.parse_study_name_from_filename(str(input_file))
if study_name is None:
err_msg = f"Unable to parse DbGaP study name from data dictionary: {input_file}!"
logger.error(err_msg)
Expand Down
Loading

0 comments on commit 08c1d79

Please sign in to comment.