diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py index 36359cd8..867257e9 100644 --- a/src/dug/core/parsers/__init__.py +++ b/src/dug/core/parsers/__init__.py @@ -9,6 +9,7 @@ from .scicrunch_parser import SciCrunchParser from .topmed_tag_parser import TOPMedTagParser from .topmed_csv_parser import TOPMedCSVParser +from .anvil_dbgap_parser import AnvilDbGaPParser logger = logging.getLogger('dug') @@ -22,6 +23,7 @@ def define_parsers(parser_dict: Dict[str, Parser]): parser_dict["topmedtag"] = TOPMedTagParser() parser_dict["topmedcsv"] = TOPMedCSVParser() parser_dict["scicrunch"] = SciCrunchParser() + parser_dict["anvil"] = AnvilDbGaPParser() class ParserNotFoundException(Exception): diff --git a/src/dug/core/parsers/anvil_dbgap_parser.py b/src/dug/core/parsers/anvil_dbgap_parser.py new file mode 100644 index 00000000..72d3d2e4 --- /dev/null +++ b/src/dug/core/parsers/anvil_dbgap_parser.py @@ -0,0 +1,6 @@ +from .dbgap_parser import DbGaPParser + + +class AnvilDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "AnVIL" diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index cc55ace4..2dd553eb 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -21,6 +21,9 @@ def parse_study_name_from_filename(filename: str): return match.group(1) return None + def _get_element_type(self): + return "DbGaP" + def __call__(self, input_file: InputFile) -> List[Indexable]: logger.debug(input_file) tree = ET.parse(input_file) @@ -41,7 +44,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}", name=variable.find('name').text, desc=variable.find('description').text.lower(), - elem_type="DbGaP", + elem_type=self._get_element_type(), collection_id=f"{study_id}.p{participant_set}", collection_name=study_name) diff --git a/tests/integration/data/phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml b/tests/integration/data/phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml new file mode 100644 index 00000000..7614632e --- /dev/null +++ b/tests/integration/data/phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml @@ -0,0 +1,2 @@ + +SUBJECT_IDSubject IDstringCONSENTConsent group as determined by DACencoded valueHealth/Medical/Biomedical (NPU) (HMB-NPU)AFFECTION_STATUSCase control status of the subject for atrial fibrillation (AF)encoded valueCase diff --git a/tests/integration/test_loaders.py b/tests/integration/test_loaders.py index c0f34756..9287da44 100644 --- a/tests/integration/test_loaders.py +++ b/tests/integration/test_loaders.py @@ -18,7 +18,7 @@ def test_filesystem_loader(): filepath=TEST_DATA_DIR, ) files = list(targets) - assert len(files) == 9 + assert len(files) == 10 with pytest.raises(ValueError): targets = load_from_filesystem( diff --git a/tests/integration/test_parsers.py b/tests/integration/test_parsers.py index de6e4dd9..157e8735 100644 --- a/tests/integration/test_parsers.py +++ b/tests/integration/test_parsers.py @@ -1,4 +1,4 @@ -from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser +from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser from tests.integration.conftest import TEST_DATA_DIR def test_dbgap_parse_study_name_from_filename(): @@ -61,3 +61,12 @@ def test_topmed_tag_parser(): for element in elements: assert element.name != element.id assert element.description != element.id + + +def test_anvil_parser(): + parser = AnvilDbGaPParser() + parse_file = str(TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml") + elements = parser(parse_file) + assert len(elements) == 3 + for element in elements: + assert element.type == "AnVIL"