Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Anvil parser #208

Merged
merged 12 commits into from
May 2, 2022
2 changes: 2 additions & 0 deletions src/dug/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .scicrunch_parser import SciCrunchParser
from .topmed_tag_parser import TOPMedTagParser
from .topmed_csv_parser import TOPMedCSVParser
from .anvil_dbgap_parser import AnvilDbGaPParser

logger = logging.getLogger('dug')

Expand All @@ -22,6 +23,7 @@ def define_parsers(parser_dict: Dict[str, Parser]):
parser_dict["topmedtag"] = TOPMedTagParser()
parser_dict["topmedcsv"] = TOPMedCSVParser()
parser_dict["scicrunch"] = SciCrunchParser()
parser_dict["anvil"] = AnvilDbGaPParser()


class ParserNotFoundException(Exception):
Expand Down
6 changes: 6 additions & 0 deletions src/dug/core/parsers/anvil_dbgap_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .dbgap_parser import DbGaPParser


class AnvilDbGaPParser(DbGaPParser):
    """DbGaP parser variant whose element type is reported as "AnVIL".

    Only the element-type hook is overridden here; everything else is
    inherited from ``DbGaPParser`` unchanged.
    """

    def _get_element_type(self):
        """Return the element type label used for AnVIL elements."""
        element_type = "AnVIL"
        return element_type
5 changes: 4 additions & 1 deletion src/dug/core/parsers/dbgap_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def parse_study_name_from_filename(filename: str):
return match.group(1)
return None

def _get_element_type(self):
return "DbGaP"

def __call__(self, input_file: InputFile) -> List[Indexable]:
logger.debug(input_file)
tree = ET.parse(input_file)
Expand All @@ -41,7 +44,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}",
name=variable.find('name').text,
desc=variable.find('description').text.lower(),
elem_type="DbGaP",
elem_type=self._get_element_type(),
collection_id=f"{study_id}.p{participant_set}",
collection_name=study_name)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="./datadict_v2.xsl"?><data_table id="pht009987.v1" study_id="phs001547.v1" participant_set="1" date_created="Tue Mar 17 09:38:19 2020"><description/><variable id="phv00427626.v1"><name>SUBJECT_ID</name><description>Subject ID</description><type>string</type></variable><variable id="phv00427627.v1"><name>CONSENT</name><description>Consent group as determined by DAC</description><type>encoded value</type><value code="1">Health/Medical/Biomedical (NPU) (HMB-NPU)</value></variable><variable id="phv00427628.v1"><name>AFFECTION_STATUS</name><description>Case control status of the subject for atrial fibrillation (AF)</description><type>encoded value</type><value code="1">Case</value></variable></data_table>
2 changes: 1 addition & 1 deletion tests/integration/test_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_filesystem_loader():
filepath=TEST_DATA_DIR,
)
files = list(targets)
assert len(files) == 9
assert len(files) == 10

with pytest.raises(ValueError):
targets = load_from_filesystem(
Expand Down
11 changes: 10 additions & 1 deletion tests/integration/test_parsers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser
from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser
from tests.integration.conftest import TEST_DATA_DIR

def test_dbgap_parse_study_name_from_filename():
Expand Down Expand Up @@ -61,3 +61,12 @@ def test_topmed_tag_parser():
for element in elements:
assert element.name != element.id
assert element.description != element.id


def test_anvil_parser():
    """Parse the AnVIL data-dictionary fixture and check the resulting elements.

    The fixture is expected to yield three elements, each typed "AnVIL".
    """
    anvil_parser = AnvilDbGaPParser()
    fixture_path = TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml"
    parsed_elements = anvil_parser(str(fixture_path))

    assert len(parsed_elements) == 3
    for parsed in parsed_elements:
        assert parsed.type == "AnVIL"