diff --git a/src/dug/_version.py b/src/dug/_version.py index 43ce13db..56923025 100644 --- a/src/dug/_version.py +++ b/src/dug/_version.py @@ -1 +1 @@ -__version__ = "2.9.0" +__version__ = "2.9.1rc" diff --git a/src/dug/config.py b/src/dug/config.py index 20b285c9..129ba080 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -67,7 +67,15 @@ class Config: # Dug element type to cast the query kg nodes to "cde": { # Parse nodes matching criteria in kg - "node_type": "biolink:Publication" + "node_type": "biolink:Publication", + "curie_prefix": "HEALCDE", + "attribute_mapping": { + # "DugElement Attribute" : "KG Node attribute" + "name": "name", + "desc": "summary", + "collection_name": "cde_category", + "collection_id": "cde_category" + } } }) diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 9b39ea00..3ae70574 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -206,6 +206,8 @@ def expand_to_dug_element(self, elements = [] # using node_type as the primary criteria for matching nodes to element type. target_node_type = casting_config["node_type"] + curie_filter = casting_config["curie_prefix"] + attribute_mapping = casting_config["attribute_mapping"] target_node_type_snake_case = biolink_snake_case(target_node_type.replace("biolink:", "")) for ident_id, identifier in concept.identifiers.items(): @@ -244,14 +246,14 @@ def expand_to_dug_element(self, # and return the variables. for node_id, node in answer.nodes.items(): if target_node_type in node["category"]: - # @TODO make element creation more generic - # @TODO need to encode more data into the graph nodes, to parse them properly - element = DugElement( - elem_id=node_id, - name=node.get('name', ""), - desc=node.get('summary', ""), - elem_type=dug_element_type - ) - element.add_concept(concept) - elements.append(element) + if node['id'].startswith(curie_filter): + element_attribute_args = {"elem_id": node_id, "elem_type": dug_element_type} + element_attribute_args.update({key: node.get(attribute_mapping[key], "") + for key in attribute_mapping + }) + element = DugElement( + **element_attribute_args + ) + element.add_concept(concept) + elements.append(element) return elements diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py index e7e0b3f8..3c2e16d2 100644 --- a/src/dug/core/factory.py +++ b/src/dug/core/factory.py @@ -81,7 +81,7 @@ def build_search_obj(self, indices) -> Search: return Search(self.config, indices=indices) def build_element_extraction_parameters(self, source=None): - # Method reformats the node_to_element_queries object + # Method reformats the node_to_element_queries object # Uses tranql source use for concept crawling if source is None: source = TRANQL_SOURCE @@ -96,7 +96,9 @@ def build_element_extraction_parameters(self, source=None): { "output_dug_type": dug_type, "casting_config": { - "node_type": queries[dug_type]['node_type'] + "node_type": queries[dug_type]["node_type"], + "curie_prefix": queries[dug_type]["curie_prefix"], + "attribute_mapping": queries[dug_type]["attribute_mapping"] # CDE's are only ones # but if we had two biolink:Publication nodes we want to conditionally # cast to other output_dug_type, we could extend this config diff --git a/src/dug/core/parsers/scicrunch_parser.py b/src/dug/core/parsers/scicrunch_parser.py index 696ee826..0405e3c8 100644 --- a/src/dug/core/parsers/scicrunch_parser.py +++ b/src/dug/core/parsers/scicrunch_parser.py @@ -68,7 +68,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}", name=variable.find('name').text, desc=variable.find('description').text.lower(), - elem_type="DbGaP", + elem_type="SPARC", collection_id=f"{study_id}.p{participant_set}", collection_name=study_name) diff --git a/tests/unit/mocks/data/tranql_response.json b/tests/unit/mocks/data/tranql_response.json index ed1e5c0c..da556ddd 100644 --- a/tests/unit/mocks/data/tranql_response.json +++ b/tests/unit/mocks/data/tranql_response.json @@ -8,14 +8,14 @@ } }, "nodes": { + "publication": { + "category": "biolink:Publication" + }, "disease": { "category": "biolink:Disease", "id": [ "MONDO:0008187" ] - }, - "publication": { - "category": "biolink:Publication" } } }, @@ -25,11 +25,11 @@ "name": "panic disorder 1", "category": [ "biolink:Disease", - "biolink:Entity", + "biolink:DiseaseOrPhenotypicFeature", "biolink:ThingWithTaxon", "biolink:BiologicalEntity", - "biolink:NamedThing", - "biolink:DiseaseOrPhenotypicFeature" + "biolink:Entity", + "biolink:NamedThing" ], "attributes": [ { @@ -72,6 +72,30 @@ "value": "Filename: Photosensitivity_PAQ_CDE_v1.0.json; File path: Supplemental Questionnaires/Sensory/Photosensitivity Assessment Questionnaire (PAQ); Photosensitivity Assessment Questionnaire", "name": "summary" }, + { + "type": "NA", + "value": [ + "Supplemental Questionnaires", + "Sensory", + "Photosensitivity Assessment Questionnaire (PAQ)" + ], + "name": "cde_categories" + }, + { + "type": "NA", + "value": [ + "Supplemental Questionnaires", + "Adult/Pediatric", + "Acute/Chronic Pain", + "Photosensitivity Assessment Questionnaire (PAQ)" + ], + "name": "cde_category_extended" + }, + { + "type": "NA", + "value": "Supplemental Questionnaires", + "name": "cde_category" + }, { "type": "NA", "value": [ diff --git a/tests/unit/test_crawler.py b/tests/unit/test_crawler.py index 80ff05e6..059a42f5 100644 --- a/tests/unit/test_crawler.py +++ b/tests/unit/test_crawler.py @@ -84,7 +84,16 @@ def test_expand_to_dug_element(crawler): concept.add_identifier(identifier) new_elements = crawler.expand_to_dug_element( concept=concept, - casting_config={"node_type": "biolink:Publication"}, + casting_config={ + "node_type": "biolink:Publication", + "curie_prefix": "HEALCDE", + "attribute_mapping": { + "name": "name", + "desc": "summary", + "collection_name": "cde_category", + "collection_id": "cde_category" + } + }, dug_element_type="test-element", tranql_source="test:graph" )