From 60b473aa5ea9fc5bcf760bea331723f64cb8dfe6 Mon Sep 17 00:00:00 2001
From: YaphetKG <45075777+YaphetKG@users.noreply.github.com>
Date: Tue, 8 Feb 2022 15:12:27 -0500
Subject: [PATCH 1/4] Release/2.8.0 (#198)

* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node_to dug element parsing

* adding more parameters to crawler to enable configuration of the element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg

* Update _version.py

* Update _version.py

updating version for final push to master

* Update factory.py

Adding more comments

Co-authored-by: Carl Schreep
Co-authored-by: Yaphetkg
---
 src/dug/_version.py                        |   2 +-
 src/dug/cli.py                             |  11 ++
 src/dug/config.py                          |   8 ++
 src/dug/core/annotate.py                   |   8 +-
 src/dug/core/crawler.py                    |  89 +++++++++++-
 src/dug/core/factory.py                    |  28 +++-
 src/dug/core/tranql.py                     |  25 +---
 src/dug/utils.py                           |  24 ++++
 tests/unit/mocks/MockCrawler.py            |  63 ++++++++
 tests/unit/mocks/__init__.py               |   0
 tests/unit/mocks/data/tranql_response.json | 158 +++++++++++++++++++++
 tests/unit/test_cli.py                     |  13 ++
 tests/unit/test_crawler.py                 |  91 ++++++++++++
 13 files changed, 491 insertions(+), 29 deletions(-)
 create mode 100644 tests/unit/mocks/MockCrawler.py
 create mode 100644 tests/unit/mocks/__init__.py
 create mode 100644 tests/unit/mocks/data/tranql_response.json
 create mode 100644 tests/unit/test_crawler.py

diff --git a/src/dug/_version.py b/src/dug/_version.py
index 2614ce9d..892994aa 100644
--- a/src/dug/_version.py
+++ b/src/dug/_version.py
@@ -1 +1 @@
-__version__ = "2.7.0"
+__version__ = "2.8.0"

diff --git a/src/dug/cli.py b/src/dug/cli.py
index 651fdabe..1033e85f 100755
--- a/src/dug/cli.py
+++ b/src/dug/cli.py
@@ -59,6 +59,14 @@ def get_argparser():
         default=None
     )
 
+    crawl_parser.add_argument(
+        "-x", "--extract-from-graph",
+        help="[Optional] Extract dug elements from the TranQL knowledge graph using annotated concepts",
+        dest="extract_dug_elements",
+        default=False,
+        action="store_true"
+    )
+
     # Search subcommand
     search_parser = subparsers.add_parser('search', help='Apply semantic search')
     search_parser.set_defaults(func=search)
@@ -95,6 +103,9 @@ def get_argparser():
 
 def crawl(args):
     config = Config.from_env()
+    if not args.extract_dug_elements:
+        # disable extraction
+        config.node_to_element_queries = {}
    factory = DugFactory(config)
     dug = Dug(factory)
     dug.crawl(args.target, args.parser_type, args.element_type)

diff --git a/src/dug/config.py b/src/dug/config.py
index b323ac7c..20b285c9 100644
--- a/src/dug/config.py
+++ b/src/dug/config.py
@@ -63,6 +63,14 @@ class Config:
         "phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
     })
 
+    node_to_element_queries: dict = field(default_factory=lambda: {
+        # Dug element type to cast the query kg nodes to
+        "cde": {
+            # Parse nodes matching criteria in kg
+            "node_type": "biolink:Publication"
+        }
+    })
+
     concept_expander: dict = field(default_factory=lambda: {
         "url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false",
         "min_tranql_score": 0.0
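The `-x` flag and the new `node_to_element_queries` config field gate the feature together: the queries only take effect when the flag is set. A minimal sketch of that interaction, mirroring `crawl()` above (not part of the patch; `args` stands in for the parsed argparse namespace):

```python
from dug.config import Config

config = Config.from_env()
# the default config ships one query: cast biolink:Publication nodes to "cde" elements
assert "cde" in config.node_to_element_queries

if not args.extract_dug_elements:        # crawl was invoked without -x
    config.node_to_element_queries = {}  # an empty dict disables extraction downstream
```
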
diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py
index b891beec..0adef1f1 100644
--- a/src/dug/core/annotate.py
+++ b/src/dug/core/annotate.py
@@ -130,7 +130,7 @@ def __init__(self, url, min_tranql_score=0.2):
     def is_acceptable_answer(self, answer):
         return True
 
-    def expand_identifier(self, identifier, query_factory, kg_filename):
+    def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False):
 
         answer_kgs = []
@@ -182,9 +182,11 @@ def expand_identifier(self, identifier, query_factory, kg_filename):
             # Temporarily surround in try/except because sometimes the answer graphs
             # contain invalid references to edges/nodes
             # This will be fixed in Robokop but for now just silently warn if answer is invalid
+            node_attributes_filter = None if include_all_attributes else self.include_node_keys
+            edge_attributes_filter = None if include_all_attributes else self.include_edge_keys
             answer_kg = kg.get_answer_subgraph(answer,
-                                               include_node_keys=self.include_node_keys,
-                                               include_edge_keys=self.include_edge_keys)
+                                               include_node_keys=node_attributes_filter,
+                                               include_edge_keys=edge_attributes_filter)
 
             # Add subgraph to list of acceptable answers to query
             answer_kgs.append(answer_kg)
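The new keyword only changes which attributes survive subgraph extraction: with `include_all_attributes=True` the key whitelists are replaced by `None`, which this diff treats as "keep everything". A hedged sketch of the two call styles, borrowing the `tranqlizer` name from the crawler code below:

```python
# Default concept expansion: node/edge attributes are trimmed to the whitelisted keys.
answer_kgs = tranqlizer.expand_identifier(ident_id, query, kg_filename="crawl/out.json")

# Dug-element extraction: keep every node/edge attribute so answer nodes can later
# be parsed into DugElements (names, summaries, etc. live in those attributes).
answer_kgs = tranqlizer.expand_identifier(ident_id, query, kg_filename="crawl/out.json",
                                          include_all_attributes=True)
```
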
diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py
index 633b908b..9b39ea00 100644
--- a/src/dug/core/crawler.py
+++ b/src/dug/core/crawler.py
@@ -4,6 +4,8 @@
 import traceback
 
 from dug.core.parsers import Parser, DugElement, DugConcept
+import dug.core.tranql as tql
+from dug.utils import biolink_snake_case
 
 logger = logging.getLogger('dug')
 
@@ -11,7 +13,8 @@ class Crawler:
     def __init__(self, crawl_file: str, parser: Parser, annotator,
                  tranqlizer, tranql_queries,
-                 http_session, exclude_identifiers=None, element_type=None):
+                 http_session, exclude_identifiers=None, element_type=None,
+                 element_extraction=None):
 
         if exclude_identifiers is None:
             exclude_identifiers = []
@@ -24,6 +27,7 @@ def __init__(self, crawl_file: str, parser: Parser, annotator,
         self.tranql_queries = tranql_queries
         self.http_session = http_session
         self.exclude_identifiers = exclude_identifiers
+        self.element_extraction = element_extraction
         self.elements = []
         self.concepts = {}
         self.crawlspace = "crawl"
@@ -52,7 +56,10 @@ def crawl(self):
         # Annotate elements
         self.annotate_elements()
 
-        # Expand concepts
+        # if elements are extracted from the graph, this array will contain the new dug elements
+        dug_elements_from_graph = []
+
+        # Expand concepts to other concepts
         concept_file = open(f"{self.crawlspace}/concept_file.json", "w")
         for concept_id, concept in self.concepts.items():
             # Use TranQL queries to fetch knowledge graphs containing related but not synonymous biological terms
@@ -70,6 +77,21 @@ def crawl(self):
             # Write concept out to a file
             concept_file.write(f"{json.dumps(concept.get_searchable_dict(), indent=2)}")
 
+            if self.element_extraction:
+                for element_extraction_config in self.element_extraction:
+                    casting_config = element_extraction_config['casting_config']
+                    tranql_source = element_extraction_config['tranql_source']
+                    dug_element_type = element_extraction_config['output_dug_type']
+                    dug_elements_from_graph += self.expand_to_dug_element(
+                        concept=concept,
+                        casting_config=casting_config,
+                        dug_element_type=dug_element_type,
+                        tranql_source=tranql_source
+                    )
+
+        # add new elements to parsed elements
+        self.elements += dug_elements_from_graph
+
         # Set element optional terms now that concepts have been expanded
         # Open variable file for writing
         variable_file = open(f"{self.crawlspace}/element_file.json", "w")
@@ -117,7 +139,6 @@ def annotate_elements(self):
                 for concept_to_add in concepts_to_add:
                     element.add_concept(concept_to_add)
 
-
     def annotate_element(self, element):
 
         # Annotate with a set of normalized ontology identifiers
@@ -172,3 +193,65 @@ def expand_concept(self, concept):
             # Add any answer knowledge graphs to
             for answer in answers:
                 concept.add_kg_answer(answer, query_name=query_name)
+
+    def expand_to_dug_element(self,
+                              concept,
+                              casting_config,
+                              dug_element_type,
+                              tranql_source):
+        """
+        Given a concept, look up the knowledge graph to construct dug elements out of kg results.
+        Performs concept -> target_node_type crawls and converts target_node_type nodes to dug elements of type `dug_element_type`.
+        """
+        elements = []
+        # using node_type as the primary criterion for matching nodes to element type.
+        target_node_type = casting_config["node_type"]
+        target_node_type_snake_case = biolink_snake_case(target_node_type.replace("biolink:", ""))
+        for ident_id, identifier in concept.identifiers.items():
+
+            # Check to see if the concept identifier has types defined; these are used to create
+            # tranql queries below.
+            if not identifier.types:
+                continue
+
+            # convert the first type to snake case to be used in the tranql query.
+            # the first type is the leaf type, coming from Node Normalization.
+            node_type = biolink_snake_case(identifier.types[0].replace("biolink:", ""))
+            try:
+                # The tranql query factory currently supports only select node types as valid queries.
+                # Types missing from QueryFactory.data_types will be skipped with this try/except.
+                query = tql.QueryFactory([node_type, target_node_type_snake_case], tranql_source)
+            except tql.InvalidQueryError as exception:
+                logger.debug(f"Skipping {ident_id}, {exception}")
+                continue
+
+            # check if the tranql query object can use the curie.
+            if query.is_valid_curie(ident_id):
+                logger.info(f"Expanding {ident_id} to other dug elements")
+                # Fetch kg and answer;
+                # replace ":" with "~" to avoid windows os errors
+                kg_outfile = f"{self.crawlspace}/" + f"{ident_id}_{target_node_type}.json".replace(":", "~")
+
+                # query tranql, answers will include all node and edge attributes
+                answers = self.tranqlizer.expand_identifier(ident_id, query,
+                                                            kg_filename=kg_outfile,
+                                                            include_all_attributes=True)
+
+                # for each answer construct a dug element
+                for answer in answers:
+                    # here we will inspect the answers and create new dug elements based on the target node type,
+                    # and return the variables.
+                    for node_id, node in answer.nodes.items():
+                        if target_node_type in node["category"]:
+                            # @TODO make element creation more generic
+                            # @TODO need to encode more data into the graph nodes, to parse them properly
+                            element = DugElement(
+                                elem_id=node_id,
+                                name=node.get('name', ""),
+                                desc=node.get('summary', ""),
+                                elem_type=dug_element_type
+                            )
+                            element.add_concept(concept)
+                            elements.append(element)
+        return elements
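For a concrete picture of what `expand_to_dug_element` emits, here is a hedged walk-through using the publication node from the test fixture further down (attribute values abbreviated; `concept` is the DugConcept being expanded):

```python
# One KG answer node whose category matches casting_config["node_type"] == "biolink:Publication"
node_id = "HEALCDE:Photosensitivity_PAQ_CDE_v1.0.json"
node = {
    "name": "Filename: Photosensitivity_PAQ_CDE_v1.0.json",
    "summary": "Photosensitivity Assessment Questionnaire ...",  # abbreviated
    "category": ["biolink:Publication"],
}

# ...becomes one DugElement tied back to the originating concept:
element = DugElement(elem_id=node_id,
                     name=node.get("name", ""),
                     desc=node.get("summary", ""),
                     elem_type="cde")
element.add_concept(concept)
```
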
diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py
index d7050e36..e7e0b3f8 100644
--- a/src/dug/core/factory.py
+++ b/src/dug/core/factory.py
@@ -40,7 +40,8 @@ def build_crawler(self, target, parser: Parser, element_type: str, tranql_source
             tranql_queries=self.build_tranql_queries(tranql_source),
             http_session=self.build_http_session(),
             exclude_identifiers=self.config.tranql_exclude_identifiers,
-            element_type=element_type
+            element_type=element_type,
+            element_extraction=self.build_element_extraction_parameters(),
         )
 
         return crawler
@@ -78,3 +79,28 @@ def build_tranql_queries(self, source=None) -> Dict[str, tql.QueryFactory]:
 
     def build_search_obj(self, indices) -> Search:
         return Search(self.config, indices=indices)
+
+    def build_element_extraction_parameters(self, source=None):
+        # This method reformats the node_to_element_queries config object,
+        # reusing the tranql source used for concept crawling.
+        if source is None:
+            source = TRANQL_SOURCE
+        queries = self.config.node_to_element_queries
+        # reformat the config as an array; in the crawler this is looped over
+        # to make calls to the expansion logic.
+        # casting_config will be a set of conditions to perform casting on.
+        # Currently we cast based on the node type returned from the tranql query;
+        # we might want to filter those based on curie type or other conditions if
+        # node type is too broad.
+        return [
+            {
+                "output_dug_type": dug_type,
+                "casting_config": {
+                    "node_type": queries[dug_type]['node_type']
+                    # CDEs are the only ones for now,
+                    # but if we had two biolink:Publication nodes and wanted to conditionally
+                    # cast them to other output_dug_types, we could extend this config
+                },
+                "tranql_source": source
+            } for dug_type in queries
+        ]

diff --git a/src/dug/core/tranql.py b/src/dug/core/tranql.py
index 8e15d82a..35134555 100644
--- a/src/dug/core/tranql.py
+++ b/src/dug/core/tranql.py
@@ -1,4 +1,5 @@
-import json, re
+import json
+from dug.utils import biolink_snake_case
 
 
 class MissingNodeReferenceError(BaseException):
@@ -179,25 +180,7 @@ def get_kg(self):
         return old_kg_model
 
     def _snake_case(self, arg: str):
-        """Convert string to snake_case.
-        Non-alphanumeric characters are replaced with _.
-        CamelCase is replaced with snake_case.
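With the default config above, `build_element_extraction_parameters` would return something like the following (a sketch; `TRANQL_SOURCE` is whatever source concept crawling already uses):

```python
[
    {
        "output_dug_type": "cde",
        "casting_config": {"node_type": "biolink:Publication"},
        "tranql_source": TRANQL_SOURCE,
    }
]
```
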
- """ - # replace non-alphanumeric characters with _ - tmp = re.sub(r'\W', '_', arg) - # replace X with _x - tmp = re.sub( - r'(?<=[a-z])[A-Z](?=[a-z])', - lambda c: '_' + c.group(0).lower(), - tmp - ) - # lower-case first character - tmp = re.sub( - r'^[A-Z](?=[a-z])', - lambda c: c.group(0).lower(), - tmp - ) - return tmp + return biolink_snake_case(arg) class InvalidQueryError(BaseException): @@ -207,7 +190,7 @@ class InvalidQueryError(BaseException): class QueryFactory: # Class member list of valid data types that can be included in query - data_types = ["phenotypic_feature", "gene", "disease", "chemical_substance", + data_types = ["publication", "phenotypic_feature", "gene", "disease", "chemical_substance", "drug_exposure", "biological_process", "anatomical_entity", "small_molecule", "chemical_mixture", "chemical_entity"] diff --git a/src/dug/utils.py b/src/dug/utils.py index 5ab0703f..9b224387 100644 --- a/src/dug/utils.py +++ b/src/dug/utils.py @@ -1,3 +1,5 @@ +import re + class ObjectFactory: def __init__(self): self._builders = {} @@ -34,3 +36,25 @@ def get_dbgap_study_link(study_id): def get_nida_study_link(study_id): base_url = "https://datashare.nida.nih.gov/study" return f'{base_url}/{study_id}' + + +def biolink_snake_case(arg): + """Convert such SnakeCase to snake_case. + Non-alphanumeric characters are replaced with _. + CamelCase is replaced with snake_case. + """ + # replace non-alphanumeric characters with _ + tmp = re.sub(r'\W', '_', arg) + # replace X with _x + tmp = re.sub( + r'(?<=[a-z])[A-Z](?=[a-z])', + lambda c: '_' + c.group(0).lower(), + tmp + ) + # lower-case first character + tmp = re.sub( + r'^[A-Z](?=[a-z])', + lambda c: c.group(0).lower(), + tmp + ) + return tmp \ No newline at end of file diff --git a/tests/unit/mocks/MockCrawler.py b/tests/unit/mocks/MockCrawler.py new file mode 100644 index 00000000..1c69dabe --- /dev/null +++ b/tests/unit/mocks/MockCrawler.py @@ -0,0 +1,63 @@ +from unittest.mock import MagicMock, Mock + +import pytest +import os +import json + + +from dug.core.annotate import Identifier +from dug.core.tranql import QueryFactory, QueryKG + +# Makes some simple mokes +ParserMock = MagicMock() +HTTPSessionMock = MagicMock() + +# mocking tranql queries +TranqlQueriesMock = {} +for key, query in { + "disease": ["disease", "phenotypic_feature"], + "pheno": ["phenotypic_feature", "disease"] +}.items(): + TranqlQueriesMock[key] = QueryFactory(query, source="test") + + +# for testing no id exclusion +ExcludedIDs = [] + +ANNOTATED_IDS = [ + Identifier("MONDO:0", "0", ["disease"]), + Identifier("PUBCHEM.COMPOUND:1", "1", ["chemical"]) + ] +for ids in ANNOTATED_IDS: + ids.type = ids.types[0] +# annotator with annotate method returning mocked concepts +AnnotatorMock = MagicMock() +AnnotatorMock.annotate = Mock(return_value=ANNOTATED_IDS) + +# tranqlizer returning mock kg when expanding concepts +TranqlizerMock = MagicMock() + +# Get example tranql answer +with open(os.path.join(os.path.dirname(__file__), "data", "tranql_response.json")) as stream: + tranql_json = json.load(stream) + kg_answer = QueryKG(kg_json=tranql_json) + TRANQL_ANSWERS = [] + for answer in kg_answer.answers: + TRANQL_ANSWERS.append(kg_answer.get_answer_subgraph(answer)) + +TranqlizerMock.expand_identifier = Mock(return_value=TRANQL_ANSWERS) + +#mock a crawler with mock dependencies +@pytest.fixture +def crawler_init_args_no_graph_extraction(): + return { + "crawl_file": "test", + "parser": ParserMock, + "annotator": AnnotatorMock, + "tranqlizer": TranqlizerMock, + 
"tranql_queries": TranqlQueriesMock, + "http_session": HTTPSessionMock, + "exclude_identifiers": ExcludedIDs, + "element_type": "TestElement", + "element_extraction": None + } diff --git a/tests/unit/mocks/__init__.py b/tests/unit/mocks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/mocks/data/tranql_response.json b/tests/unit/mocks/data/tranql_response.json new file mode 100644 index 00000000..ed1e5c0c --- /dev/null +++ b/tests/unit/mocks/data/tranql_response.json @@ -0,0 +1,158 @@ +{ + "message": { + "query_graph": { + "edges": { + "e1_publication_disease": { + "subject": "publication", + "object": "disease" + } + }, + "nodes": { + "disease": { + "category": "biolink:Disease", + "id": [ + "MONDO:0008187" + ] + }, + "publication": { + "category": "biolink:Publication" + } + } + }, + "knowledge_graph": { + "nodes": { + "MONDO:0008187": { + "name": "panic disorder 1", + "category": [ + "biolink:Disease", + "biolink:Entity", + "biolink:ThingWithTaxon", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:DiseaseOrPhenotypicFeature" + ], + "attributes": [ + { + "type": "NA", + "value": [ + "MONDO:0008187", + "OMIM:167870", + "UMLS:C1868649", + "UMLS:C1868650", + "UMLS:C1868651", + "MESH:C566834", + "MESH:C566835" + ], + "name": "equivalent_identifiers" + }, + { + "type": "NA", + "value": [ + "Monarch NER service + Translator normalization API" + ], + "name": "provided_by" + }, + { + "name": "reasoner", + "value": [ + "redis:" + ], + "type": "EDAM:data_0006" + } + ] + }, + "HEALCDE:Photosensitivity_PAQ_CDE_v1.0.json": { + "name": "Filename: Photosensitivity_PAQ_CDE_v1.0.json", + "category": [ + "biolink:Publication" + ], + "attributes": [ + { + "type": "NA", + "value": "Filename: Photosensitivity_PAQ_CDE_v1.0.json; File path: Supplemental Questionnaires/Sensory/Photosensitivity Assessment Questionnaire (PAQ); Photosensitivity Assessment Questionnaire", + "name": "summary" + }, + { + "type": "NA", + "value": [ + "Graph" + ], + "name": "provided_by" + }, + { + "name": "reasoner", + "value": [ + "redis:" + ], + "type": "EDAM:data_0006" + } + ] + } + }, + "edges": { + "b93815fc2af8": { + "attributes": [ + { + "type": "NA", + "name": "category", + "value": [ + "biolink:InformationContentEntityToNamedThingAssociation" + ] + }, + { + "type": "NA", + "name": "name", + "value": "panic disorder." 
diff --git a/tests/unit/mocks/data/tranql_response.json b/tests/unit/mocks/data/tranql_response.json
new file mode 100644
index 00000000..ed1e5c0c
--- /dev/null
+++ b/tests/unit/mocks/data/tranql_response.json
@@ -0,0 +1,158 @@
+{
+  "message": {
+    "query_graph": {
+      "edges": {
+        "e1_publication_disease": {
+          "subject": "publication",
+          "object": "disease"
+        }
+      },
+      "nodes": {
+        "disease": {
+          "category": "biolink:Disease",
+          "id": [
+            "MONDO:0008187"
+          ]
+        },
+        "publication": {
+          "category": "biolink:Publication"
+        }
+      }
+    },
+    "knowledge_graph": {
+      "nodes": {
+        "MONDO:0008187": {
+          "name": "panic disorder 1",
+          "category": [
+            "biolink:Disease",
+            "biolink:Entity",
+            "biolink:ThingWithTaxon",
+            "biolink:BiologicalEntity",
+            "biolink:NamedThing",
+            "biolink:DiseaseOrPhenotypicFeature"
+          ],
+          "attributes": [
+            {
+              "type": "NA",
+              "value": [
+                "MONDO:0008187",
+                "OMIM:167870",
+                "UMLS:C1868649",
+                "UMLS:C1868650",
+                "UMLS:C1868651",
+                "MESH:C566834",
+                "MESH:C566835"
+              ],
+              "name": "equivalent_identifiers"
+            },
+            {
+              "type": "NA",
+              "value": [
+                "Monarch NER service + Translator normalization API"
+              ],
+              "name": "provided_by"
+            },
+            {
+              "name": "reasoner",
+              "value": [
+                "redis:"
+              ],
+              "type": "EDAM:data_0006"
+            }
+          ]
+        },
+        "HEALCDE:Photosensitivity_PAQ_CDE_v1.0.json": {
+          "name": "Filename: Photosensitivity_PAQ_CDE_v1.0.json",
+          "category": [
+            "biolink:Publication"
+          ],
+          "attributes": [
+            {
+              "type": "NA",
+              "value": "Filename: Photosensitivity_PAQ_CDE_v1.0.json; File path: Supplemental Questionnaires/Sensory/Photosensitivity Assessment Questionnaire (PAQ); Photosensitivity Assessment Questionnaire",
+              "name": "summary"
+            },
+            {
+              "type": "NA",
+              "value": [
+                "Graph"
+              ],
+              "name": "provided_by"
+            },
+            {
+              "name": "reasoner",
+              "value": [
+                "redis:"
+              ],
+              "type": "EDAM:data_0006"
+            }
+          ]
+        }
+      },
+      "edges": {
+        "b93815fc2af8": {
+          "attributes": [
+            {
+              "type": "NA",
+              "name": "category",
+              "value": [
+                "biolink:InformationContentEntityToNamedThingAssociation"
+              ]
+            },
+            {
+              "type": "NA",
+              "name": "name",
+              "value": "panic disorder."
+            },
+            {
+              "type": "NA",
+              "name": "knowledge_source",
+              "value": [
+                "Monarch NER service + Translator normalization API"
+              ]
+            },
+            {
+              "type": "NA",
+              "name": "predicate_label",
+              "value": "mentions"
+            },
+            {
+              "name": "reasoner",
+              "value": [
+                "redis:"
+              ],
+              "type": "EDAM:data_0006"
+            }
+          ],
+          "predicate": "biolink:mentions",
+          "subject": "HEALCDE:Photosensitivity_PAQ_CDE_v1.0.json",
+          "object": "MONDO:0008187"
+        }
+      }
+    },
+    "results": [
+      {
+        "node_bindings": {
+          "publication": [
+            {
+              "id": "HEALCDE:Photosensitivity_PAQ_CDE_v1.0.json"
+            }
+          ],
+          "disease": [
+            {
+              "id": "MONDO:0008187"
+            }
+          ]
+        },
+        "edge_bindings": {
+          "e1_publication_disease": [
+            {
+              "id": "b93815fc2af8"
+            }
+          ]
+        },
+        "score": 0
+      }
+    ]
+  }
+}
\ No newline at end of file

diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index b95df8bc..99f903dd 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -27,6 +27,19 @@ def test_dug_cli_main_crawl(mock_crawl):
     main(["crawl", "somefile.csv", "--parser", "topmedtag"])
     assert mock_crawl.called_once()
 
+@mark.cli
+@patch('dug.cli.crawl')
+def test_dug_cli_main_extract_dug_elements(mock_crawl):
+    main(["crawl", "somefile.csv", "--parser", "topmedtag", "-x"])
+    assert mock_crawl.called_once()
+    assert mock_crawl.call_args_list[0].args[0].extract_dug_elements
+
+@mark.cli
+@patch('dug.cli.crawl')
+def test_dug_cli_main_extract_dug_elements_none(mock_crawl):
+    main(["crawl", "somefile.csv", "--parser", "topmedtag"])
+    assert mock_crawl.called_once()
+    assert not mock_crawl.call_args_list[0].args[0].extract_dug_elements
 
 @mark.cli
 @patch('dug.cli.search')

diff --git a/tests/unit/test_crawler.py b/tests/unit/test_crawler.py
new file mode 100644
index 00000000..80ff05e6
--- /dev/null
+++ b/tests/unit/test_crawler.py
@@ -0,0 +1,91 @@
+import pytest
+from unittest.mock import patch
+
+from dug.core import DugConcept
+from dug.core.parsers import DugElement
+from tests.unit.mocks.MockCrawler import *
+
+
+from dug.core.crawler import Crawler
+
+
+@pytest.fixture
+def crawler(crawler_init_args_no_graph_extraction):
+    return Crawler(
+        **crawler_init_args_no_graph_extraction
+    )
+
+
+def test_init(crawler):
+    assert crawler.crawlspace == "crawl"
+
+
+def test_annotate_element(crawler):
+    element = DugElement(
+        "test-id",
+        "name",
+        "some_desc",
+        "test-type",
+        "collection-id",
+        "collection-name",
+        "collection-desc"
+    )
+    crawler.annotate_element(element)
+    AnnotatorMock.annotate.assert_called_with(**{
+        "text": element.ml_ready_desc,
+        "http_session": HTTPSessionMock
+    })
+    assert len(crawler.concepts) == len(ANNOTATED_IDS)
+    assert len(element.concepts) == len(ANNOTATED_IDS)
+
+
+def test_annotate_elements(crawler):
+    elements = [DugElement(
+        "test-1",
+        "name",
+        "some_desc",
+        "test-type",
+        "collection-id",
+        "collection-name",
+        "collection-desc"
+    ), DugElement(
+        "test-2",
+        "name",
+        "some_desc",
+        "test-type",
+        "collection-id",
+        "collection-name",
+        "collection-desc"
+    )]
+    crawler.elements = elements
+    crawler.annotate_elements()
+    # annotate_elements mutates the original elements
+    for element in elements:
+        # assert all elements have the fake concepts added
+        assert len(element.concepts) == len(ANNOTATED_IDS)
+        # assert concept labels are set on the element's search terms
+        for ANNOTATED_ID in ANNOTATED_IDS:
+            assert ANNOTATED_ID.label in element.search_terms
+
+
+def test_expand_concept(crawler):
+    identifier = ANNOTATED_IDS[0]
+    concept = DugConcept(concept_id=identifier.id, name="test-concept", desc="", concept_type=identifier.types[0])
+    concept.add_identifier(identifier)
+    crawler.expand_concept(concept=concept)
+    TranqlizerMock.expand_identifier.assert_called_with(
+        identifier.id, TranqlQueriesMock.get("disease"), crawler.crawlspace + '/' + identifier.id + '_disease.json'
+    )
+    assert len(concept.kg_answers) == len(TRANQL_ANSWERS)
+
+
+def test_expand_to_dug_element(crawler):
+    identifier = ANNOTATED_IDS[0]
+    concept = DugConcept(concept_id=identifier.id, name="test-concept", desc="", concept_type=identifier.types[0])
+    concept.add_identifier(identifier)
+    new_elements = crawler.expand_to_dug_element(
+        concept=concept,
+        casting_config={"node_type": "biolink:Publication"},
+        dug_element_type="test-element",
+        tranql_source="test:graph"
+    )
+    assert len(new_elements) == len(TRANQL_ANSWERS)
From bb661be273c96295e3a550290ed5098c7b17b15b Mon Sep 17 00:00:00 2001
From: YaphetKG <45075777+YaphetKG@users.noreply.github.com>
Date: Tue, 29 Mar 2022 10:19:50 -0400
Subject: [PATCH 2/4] Release/v2.9.0 (#201)

* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node_to dug element parsing

* adding more parameters to crawler to enable configuration of the element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg

* Display es scores (#199)

* Include ES scores in variable results

* Round ES score to 6

* Update _version.py (#200)

* Update _version.py

Co-authored-by: Carl Schreep
Co-authored-by: Yaphetkg
Co-authored-by: Ginnie Hench
---
 src/dug/_version.py    | 2 +-
 src/dug/core/search.py | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/dug/_version.py b/src/dug/_version.py
index 892994aa..43ce13db 100644
--- a/src/dug/_version.py
+++ b/src/dug/_version.py
@@ -1 +1 @@
-__version__ = "2.8.0"
+__version__ = "2.9.0"

diff --git a/src/dug/core/search.py b/src/dug/core/search.py
index 6148f212..f5c83b2a 100644
--- a/src/dug/core/search.py
+++ b/src/dug/core/search.py
@@ -411,13 +411,13 @@ def search_variables(self, index, concept="", query="", size=None, data_type=Non
                 "identifiers": concept
             }
         }
-
+
         body = json.dumps({'query': query})
         total_items = self.es.count(body=body, index=index)
         search_results = self.es.search(
             index=index,
             body=body,
-            filter_path=['hits.hits._id', 'hits.hits._type', 'hits.hits._source'],
+            filter_path=['hits.hits._id', 'hits.hits._type', 'hits.hits._source', 'hits.hits._score'],
             from_=offset,
             size=size
         )
@@ -441,7 +441,8 @@ def search_variables(self, index, concept="", query="", size=None, data_type=Non
                 "description": elem_s['element_desc'],
                 "e_link": elem_s['element_action'],
                 "id": elem_id,
-                "name": elem_s['element_name']
+                "name": elem_s['element_name'],
+                "score": round(elem['_score'], 6)
             }
 
             # Case: collection not in dictionary for given data_type
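After this change each variable hit carries its Elasticsearch relevance score alongside the existing fields. An illustrative result entry (values are hypothetical, not from the patch):

```python
{
    "id": "phv00000001",          # hypothetical variable id
    "name": "age_at_enrollment",  # hypothetical
    "description": "...",
    "e_link": "...",
    "score": 11.385684            # elem['_score'] rounded to 6 decimal places
}
```
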
From 2e359b5586cf6ca0a5e70af770d77530204ce5be Mon Sep 17 00:00:00 2001
From: Yaphetkg
Date: Thu, 31 Mar 2022 08:30:56 -0400
Subject: [PATCH 3/4] adding more config options for node extraction

---
 src/dug/config.py                          | 10 +++++-
 src/dug/core/crawler.py                    | 30 ++++++++++--------
 src/dug/core/factory.py                    |  4 ++-
 tests/unit/mocks/data/tranql_response.json | 36 ++++++++++++++++++----
 tests/unit/test_crawler.py                 |  9 +++++-
 5 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/src/dug/config.py b/src/dug/config.py
index 20b285c9..129ba080 100644
--- a/src/dug/config.py
+++ b/src/dug/config.py
@@ -67,7 +67,15 @@ class Config:
         # Dug element type to cast the query kg nodes to
         "cde": {
             # Parse nodes matching criteria in kg
-            "node_type": "biolink:Publication"
+            "node_type": "biolink:Publication",
+            "curie_prefix": "HEALCDE",
+            "attribute_mapping": {
+                # "DugElement attribute": "KG node attribute"
+                "name": "name",
+                "desc": "summary",
+                "collection_name": "cde_category",
+                "collection_id": "cde_category"
+            }
         }
     })
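The new `attribute_mapping` is applied as DugElement keyword arguments. A hedged sketch of how the crawler change below consumes it, using the fixture node's attributes (`node_id` and `node` as in the earlier sketch):

```python
attribute_mapping = {"name": "name", "desc": "summary",
                     "collection_name": "cde_category", "collection_id": "cde_category"}

element_attribute_args = {"elem_id": node_id, "elem_type": "cde"}
element_attribute_args.update({key: node.get(attribute_mapping[key], "")
                               for key in attribute_mapping})
# -> name comes from node["name"], desc from node["summary"],
#    collection_name and collection_id both from node["cde_category"]
element = DugElement(**element_attribute_args)
```
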
disorder 1", "category": [ "biolink:Disease", - "biolink:Entity", + "biolink:DiseaseOrPhenotypicFeature", "biolink:ThingWithTaxon", "biolink:BiologicalEntity", - "biolink:NamedThing", - "biolink:DiseaseOrPhenotypicFeature" + "biolink:Entity", + "biolink:NamedThing" ], "attributes": [ { @@ -72,6 +72,30 @@ "value": "Filename: Photosensitivity_PAQ_CDE_v1.0.json; File path: Supplemental Questionnaires/Sensory/Photosensitivity Assessment Questionnaire (PAQ); Photosensitivity Assessment Questionnaire", "name": "summary" }, + { + "type": "NA", + "value": [ + "Supplemental Questionnaires", + "Sensory", + "Photosensitivity Assessment Questionnaire (PAQ)" + ], + "name": "cde_categories" + }, + { + "type": "NA", + "value": [ + "Supplemental Questionnaires", + "Adult/Pediatric", + "Acute/Chronic Pain", + "Photosensitivity Assessment Questionnaire (PAQ)" + ], + "name": "cde_category_extended" + }, + { + "type": "NA", + "value": "Supplemental Questionnaires", + "name": "cde_category" + }, { "type": "NA", "value": [ diff --git a/tests/unit/test_crawler.py b/tests/unit/test_crawler.py index 80ff05e6..be264b6e 100644 --- a/tests/unit/test_crawler.py +++ b/tests/unit/test_crawler.py @@ -86,6 +86,13 @@ def test_expand_to_dug_element(crawler): concept=concept, casting_config={"node_type": "biolink:Publication"}, dug_element_type="test-element", - tranql_source="test:graph" + tranql_source="test:graph", + curie_filter="HEALCDE:", + attribute_mapping={ + "name": "name", + "desc": "summary", + "collection_name": "cde_category", + "collection_id": "cde_category" + } ) assert len(new_elements) == len(TRANQL_ANSWERS) From e4b236cfcd8fb3bae1c17ae6c06895b335d0950a Mon Sep 17 00:00:00 2001 From: Yaphetkg Date: Thu, 31 Mar 2022 13:02:46 -0400 Subject: [PATCH 4/4] some refactoring --- src/dug/core/crawler.py | 12 ++++-------- tests/unit/test_crawler.py | 20 +++++++++++--------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index dcc25e33..3ae70574 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -82,15 +82,11 @@ def crawl(self): casting_config = element_extraction_config['casting_config'] tranql_source = element_extraction_config['tranql_source'] dug_element_type = element_extraction_config['output_dug_type'] - curie_filter = element_extraction_config['curie_prefix'] - attribute_mapping = element_extraction_config['attribute_mapping'] dug_elements_from_graph += self.expand_to_dug_element( concept=concept, casting_config=casting_config, dug_element_type=dug_element_type, - tranql_source=tranql_source, - curie_filter=curie_filter, - attribute_mapping=attribute_mapping + tranql_source=tranql_source ) # add new elements to parsed elements @@ -202,9 +198,7 @@ def expand_to_dug_element(self, concept, casting_config, dug_element_type, - tranql_source, - curie_filter, - attribute_mapping): + tranql_source): """ Given a concept look up the knowledge graph to construct dug elements out of kg results does concept -> target_node_type crawls and converts target_node_type to dug element of type `dug_element_type` @@ -212,6 +206,8 @@ def expand_to_dug_element(self, elements = [] # using node_type as the primary criteria for matching nodes to element type. 
target_node_type = casting_config["node_type"] + curie_filter = casting_config["curie_prefix"] + attribute_mapping = casting_config["attribute_mapping"] target_node_type_snake_case = biolink_snake_case(target_node_type.replace("biolink:", "")) for ident_id, identifier in concept.identifiers.items(): diff --git a/tests/unit/test_crawler.py b/tests/unit/test_crawler.py index be264b6e..059a42f5 100644 --- a/tests/unit/test_crawler.py +++ b/tests/unit/test_crawler.py @@ -84,15 +84,17 @@ def test_expand_to_dug_element(crawler): concept.add_identifier(identifier) new_elements = crawler.expand_to_dug_element( concept=concept, - casting_config={"node_type": "biolink:Publication"}, + casting_config={ + "node_type": "biolink:Publication", + "curie_prefix": "HEALCDE", + "attribute_mapping": { + "name": "name", + "desc": "summary", + "collection_name": "cde_category", + "collection_id": "cde_category" + } + }, dug_element_type="test-element", - tranql_source="test:graph", - curie_filter="HEALCDE:", - attribute_mapping={ - "name": "name", - "desc": "summary", - "collection_name": "cde_category", - "collection_id": "cde_category" - } + tranql_source="test:graph" ) assert len(new_elements) == len(TRANQL_ANSWERS)