Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bacpac fix #320

Merged
merged 11 commits into from
Sep 27, 2023
7 changes: 6 additions & 1 deletion src/dug/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,17 @@ class Config:
# Parse nodes matching criteria in kg
"node_type": "biolink:Publication",
"curie_prefix": "HEALCDE",
# list of attributes whose values are lists; the first item is used as the string value
"list_field_choose_first": [
"files"
],
"attribute_mapping": {
# "DugElement Attribute" : "KG Node attribute"
"name": "name",
"desc": "summary",
"collection_name": "cde_category",
"collection_id": "cde_category"
"collection_id": "cde_category",
"collection_action": "files"
}
}
})
Expand Down
10 changes: 7 additions & 3 deletions src/dug/core/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ def expand_to_dug_element(self,
target_node_type = casting_config["node_type"]
curie_filter = casting_config["curie_prefix"]
attribute_mapping = casting_config["attribute_mapping"]
array_to_string = casting_config["list_field_choose_first"]
target_node_type_snake_case = biolink_snake_case(target_node_type.replace("biolink:", ""))
for ident_id, identifier in concept.identifiers.items():

Expand Down Expand Up @@ -248,9 +249,12 @@ def expand_to_dug_element(self,
if target_node_type in node["category"]:
if node['id'].startswith(curie_filter):
element_attribute_args = {"elem_id": node_id, "elem_type": dug_element_type}
element_attribute_args.update({key: node.get(attribute_mapping[key], "")
for key in attribute_mapping
})
for key in attribute_mapping:
mapped_value = node.get(attribute_mapping[key], "")
# treat all attributes as strings
if key in array_to_string and isinstance(mapped_value, list) and len(mapped_value) > 0:
YaphetKG marked this conversation as resolved.
Show resolved Hide resolved
mapped_value = mapped_value[0]
element_attribute_args.update({key: mapped_value})
element = DugElement(
**element_attribute_args
)
Expand Down
3 changes: 2 additions & 1 deletion src/dug/core/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def build_element_extraction_parameters(self, source=None):
"casting_config": {
"node_type": queries[dug_type]["node_type"],
"curie_prefix": queries[dug_type]["curie_prefix"],
"attribute_mapping": queries[dug_type]["attribute_mapping"]
"attribute_mapping": queries[dug_type]["attribute_mapping"],
"list_field_choose_first": queries[dug_type]["list_field_choose_first"]
# CDEs are currently the only ones,
# but if we had two biolink:Publication nodes that we wanted to conditionally
# cast to another output_dug_type, we could extend this config
Expand Down
6 changes: 3 additions & 3 deletions src/dug/core/parsers/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ class DugElement:
# Basic class for holding information for an object you want to make searchable via Dug
# Could be a DbGaP variable, DICOM image, App, or really anything
# Optionally can hold information pertaining to a containing collection (e.g. dbgap study or dicom image series)
def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc=""):
def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action=""):
self.id = elem_id
self.name = name
self.description = desc
self.type = elem_type
self.collection_id = collection_id
self.collection_name = collection_name
self.collection_desc = collection_desc
self.action = ""
self.collection_action = ""
self.action = action
self.collection_action = collection_action
self.concepts = {}
self.ml_ready_desc = desc
self.search_terms = []
Expand Down
14 changes: 8 additions & 6 deletions src/dug/core/parsers/bacpac_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@ class BACPACParser(FileParser):
# Class for parsing BACPAC data dictionaries in dbGaP XML format into a set of Dug Elements.

@staticmethod
def parse_study_name_from_filename(filename: str):
def get_study_file_name():
# Parse the form name from the xml filename
return filename.split('/')[-1].replace('.xml', '')
return "Back Pain Consortium (BACPAC) Minimum Dataset"

def __call__(self, input_file: InputFile) -> List[Indexable]:
logger.debug(input_file)
tree = ET.parse(input_file)
root = tree.getroot()
study_id = root.attrib['study_id']
study_id = "HEALPLATFORM:HDP00692"

# Parse study name from file handle
study_name = self.parse_study_name_from_filename(str(input_file))
study_name = self.get_study_file_name()

if study_name is None:
err_msg = f"Unable to parse BACPAC Form name from data dictionary: {input_file}!"
Expand All @@ -38,8 +38,10 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
desc=description.lower(),
elem_type="BACPAC",
collection_id=f"{study_id}",
collection_name=study_name)

collection_name=study_name
)
elem.action = "https://healdata.org/portal/discovery/HDP00692"
elem.collection_action = "https://healdata.org/portal/discovery/HDP00692"
# Add to set of variables
logger.debug(elem)
elements.append(elem)
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/test_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ def test_expand_to_dug_element(crawler):
"desc": "summary",
"collection_name": "cde_category",
"collection_id": "cde_category"
}
},
"list_field_choose_first": []
},
dug_element_type="test-element",
tranql_source="test:graph"
Expand Down