Release/2.8.0 #198

Merged
5 commits merged on Feb 8, 2022

Changes from 3 commits
2 changes: 1 addition & 1 deletion src/dug/_version.py
@@ -1 +1 @@
__version__ = "2.7.0"
__version__ = "2.8.rc0"
11 changes: 11 additions & 0 deletions src/dug/cli.py
@@ -59,6 +59,14 @@ def get_argparser():
default=None
)

crawl_parser.add_argument(
"-x", "--extract-from-graph",
help="[Optional] Extract dug elements for tranql using concepts from annotation",
dest="extract_dug_elements",
default=False,
action="store_true"
)

# Search subcommand
search_parser = subparsers.add_parser('search', help='Apply semantic search')
search_parser.set_defaults(func=search)
@@ -95,6 +103,9 @@ def get_argparser():

def crawl(args):
config = Config.from_env()
if not args.extract_dug_elements:
# disable extraction
config.node_to_element_queries = {}
factory = DugFactory(config)
dug = Dug(factory)
dug.crawl(args.target, args.parser_type, args.element_type)
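A minimal sketch of how the new flag surfaces on the parsed arguments; the positional target and the -p parser value below are hypothetical placeholders for the usual crawl arguments:

from dug.cli import get_argparser

parser = get_argparser()
args = parser.parse_args(["crawl", "target.csv", "-p", "topmed", "-x"])
assert args.extract_dug_elements is True  # False unless -x/--extract-from-graph is passed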
8 changes: 8 additions & 0 deletions src/dug/config.py
@@ -63,6 +63,14 @@ class Config:
"phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
})

node_to_element_queries: dict = field(default_factory=lambda: {
# Dug element type to cast the query kg nodes to
"cde": {
# Parse nodes matching criteria in kg
"node_type": "biolink:Publication"
}
})

concept_expander: dict = field(default_factory=lambda: {
"url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false",
"min_tranql_score": 0.0
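Each node_to_element_queries entry maps an output dug element type to the criteria used to match kg nodes. A sketch of extending the default config with a second, hypothetical entry:

from dug.config import Config

config = Config.from_env()
# keep the default "cde" mapping and add a hypothetical one for another node type
config.node_to_element_queries["study"] = {
    "node_type": "biolink:Study"  # hypothetical; only biolink:Publication ships by default
}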
8 changes: 5 additions & 3 deletions src/dug/core/annotate.py
@@ -130,7 +130,7 @@ def __init__(self, url, min_tranql_score=0.2):
def is_acceptable_answer(self, answer):
return True

def expand_identifier(self, identifier, query_factory, kg_filename):
def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False):

answer_kgs = []

@@ -182,9 +182,11 @@ def expand_identifier(self, identifier, query_factory, kg_filename):
# Temporarily surround in try/except because sometimes the answer graphs
# contain invalid references to edges/nodes
# This will be fixed in Robokop but for now just silently warn if answer is invalid
node_attributes_filter = None if include_all_attributes else self.include_node_keys
edge_attributes_filter = None if include_all_attributes else self.include_edge_keys
answer_kg = kg.get_answer_subgraph(answer,
include_node_keys=self.include_node_keys,
include_edge_keys=self.include_edge_keys)
include_node_keys=node_attributes_filter,
include_edge_keys=edge_attributes_filter)

# Add subgraph to list of acceptable answers to query
answer_kgs.append(answer_kg)
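The new keyword defaults to False, so existing callers keep the filtered subgraphs. A sketch of the two call styles, where expander stands for the configured tranqlizer instance and the other arguments are placeholders:

# previous behavior (default): only include_node_keys/include_edge_keys survive
answer_kgs = expander.expand_identifier(identifier, query_factory, kg_filename="out.json")

# used by element extraction in the crawler: keep every node and edge attribute
answer_kgs = expander.expand_identifier(identifier, query_factory, kg_filename="out.json",
                                        include_all_attributes=True)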
89 changes: 86 additions & 3 deletions src/dug/core/crawler.py
@@ -4,14 +4,17 @@
import traceback

from dug.core.parsers import Parser, DugElement, DugConcept
import dug.core.tranql as tql
from dug.utils import biolink_snake_case

logger = logging.getLogger('dug')


class Crawler:
def __init__(self, crawl_file: str, parser: Parser, annotator,
tranqlizer, tranql_queries,
http_session, exclude_identifiers=None, element_type=None):
http_session, exclude_identifiers=None, element_type=None,
element_extraction=None):

if exclude_identifiers is None:
exclude_identifiers = []
@@ -24,6 +27,7 @@ def __init__(self, crawl_file: str, parser: Parser, annotator,
self.tranql_queries = tranql_queries
self.http_session = http_session
self.exclude_identifiers = exclude_identifiers
self.element_extraction = element_extraction
self.elements = []
self.concepts = {}
self.crawlspace = "crawl"
@@ -52,7 +56,10 @@ def crawl(self):
# Annotate elements
self.annotate_elements()

# Expand concepts
# If elements are extracted from the graph, this list will contain the new dug elements
dug_elements_from_graph = []

# Expand concepts to other concepts
concept_file = open(f"{self.crawlspace}/concept_file.json", "w")
for concept_id, concept in self.concepts.items():
# Use TranQL queries to fetch knowledge graphs containing related but not synonymous biological terms
@@ -70,6 +77,21 @@
# Write concept out to a file
concept_file.write(f"{json.dumps(concept.get_searchable_dict(), indent=2)}")

if self.element_extraction:
for element_extraction_config in self.element_extraction:
casting_config = element_extraction_config['casting_config']
tranql_source = element_extraction_config['tranql_source']
dug_element_type = element_extraction_config['output_dug_type']
dug_elements_from_graph += self.expand_to_dug_element(
concept=concept,
casting_config=casting_config,
dug_element_type=dug_element_type,
tranql_source=tranql_source
)

# add new elements to parsed elements
self.elements += dug_elements_from_graph

# Set element optional terms now that concepts have been expanded
# Open variable file for writing
variable_file = open(f"{self.crawlspace}/element_file.json", "w")
@@ -117,7 +139,6 @@ def annotate_elements(self):
for concept_to_add in concepts_to_add:
element.add_concept(concept_to_add)


def annotate_element(self, element):

# Annotate with a set of normalized ontology identifiers
@@ -172,3 +193,65 @@ def expand_concept(self, concept):
# Add any answer knowledge graphs to the concept
for answer in answers:
concept.add_kg_answer(answer, query_name=query_name)

def expand_to_dug_element(self,
concept,
casting_config,
dug_element_type,
tranql_source):
"""
Given a concept, look up the knowledge graph and construct dug elements from the kg results.
Performs concept -> target_node_type crawls and converts each matching node into a dug element of type `dug_element_type`.
"""
elements = []
# Use node_type as the primary criterion for matching kg nodes to the element type.
target_node_type = casting_config["node_type"]
target_node_type_snake_case = biolink_snake_case(target_node_type.replace("biolink:", ""))
for ident_id, identifier in concept.identifiers.items():

# Check whether the concept identifier has types defined; these are used to create
# the tranql queries below.
if not identifier.types:
continue

# Convert the first type to snake case for use in the tranql query.
# The first type is the leaf type, which comes from Node Normalization.
node_type = biolink_snake_case(identifier.types[0].replace("biolink:", ""))
try:
# The tranql query factory currently supports only select node types as valid queries.
# Types missing from QueryFactory.data_types are skipped via this try/except.
query = tql.QueryFactory([node_type, target_node_type_snake_case], tranql_source)
except tql.InvalidQueryError as exception:
logger.debug(f"Skipping {ident_id}, {exception}")
continue

# Check whether the tranql query object can use the curie.
if query.is_valid_curie(ident_id):
logger.info(f"Expanding {ident_id} to other dug elements")
# Fetch kg and answer
# Replace ":" with "~" to avoid Windows OS errors
kg_outfile = f"{self.crawlspace}/" + f"{ident_id}_{target_node_type}.json".replace(":", "~")

# Query tranql; answers will include all node and edge attributes
answers = self.tranqlizer.expand_identifier(ident_id, query,
kg_filename=kg_outfile,
include_all_attributes=True)

# for each answer construct a dug element
for answer in answers:
# Inspect the answers and create new dug elements based on the target node type.
for node_id, node in answer.nodes.items():
if target_node_type in node["category"]:
# @TODO make element creation more generic
# @TODO need to encode more data into the graph nodes, to parse them properly
element = DugElement(
elem_id=node_id,
name=node.get('name', ""),
desc=node.get('summary', ""),
elem_type=dug_element_type
)
element.add_concept(concept)
elements.append(element)
return elements
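Each element_extraction entry drives one concept-to-element crawl. A sketch of the expected shape, mirroring what build_element_extraction_parameters in factory.py emits (the tranql_source value is a hypothetical placeholder):

element_extraction = [
    {
        "output_dug_type": "cde",
        "casting_config": {"node_type": "biolink:Publication"},
        "tranql_source": "/tranql/query"  # hypothetical source string
    }
]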
20 changes: 19 additions & 1 deletion src/dug/core/factory.py
@@ -40,7 +40,8 @@ def build_crawler(self, target, parser: Parser, element_type: str, tranql_source
tranql_queries=self.build_tranql_queries(tranql_source),
http_session=self.build_http_session(),
exclude_identifiers=self.config.tranql_exclude_identifiers,
element_type=element_type
element_type=element_type,
element_extraction=self.build_element_extraction_parameters(),
)

return crawler
@@ -78,3 +79,20 @@ def build_tranql_queries(self, source=None) -> Dict[str, tql.QueryFactory]:

def build_search_obj(self, indices) -> Search:
return Search(self.config, indices=indices)

def build_element_extraction_parameters(self, source=None):
if source is None:
source = TRANQL_SOURCE
queries = self.config.node_to_element_queries
return [
{
"output_dug_type": dug_type,
"casting_config": {
"node_type": queries[dug_type]['node_type']
# CDEs are the only case for now, but if two biolink:Publication nodes
# needed to be cast conditionally to different output_dug_types,
# this config could be extended.
},
"tranql_source": source
} for dug_type in queries
]
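Given the default config, the builder yields one parameter set; when the CLI flag is off, crawl() clears node_to_element_queries and the list comes back empty, which disables extraction. A sketch:

from dug.config import Config
from dug.core.factory import DugFactory

config = Config.from_env()
params = DugFactory(config).build_element_extraction_parameters()
# -> [{'output_dug_type': 'cde',
#      'casting_config': {'node_type': 'biolink:Publication'},
#      'tranql_source': TRANQL_SOURCE}]

config.node_to_element_queries = {}  # what crawl() does without -x
assert DugFactory(config).build_element_extraction_parameters() == []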
25 changes: 4 additions & 21 deletions src/dug/core/tranql.py
@@ -1,4 +1,5 @@
import json, re
import json
from dug.utils import biolink_snake_case


class MissingNodeReferenceError(BaseException):
@@ -179,25 +180,7 @@ def get_kg(self):
return old_kg_model

def _snake_case(self, arg: str):
"""Convert string to snake_case.
Non-alphanumeric characters are replaced with _.
CamelCase is replaced with snake_case.
"""
# replace non-alphanumeric characters with _
tmp = re.sub(r'\W', '_', arg)
# replace X with _x
tmp = re.sub(
r'(?<=[a-z])[A-Z](?=[a-z])',
lambda c: '_' + c.group(0).lower(),
tmp
)
# lower-case first character
tmp = re.sub(
r'^[A-Z](?=[a-z])',
lambda c: c.group(0).lower(),
tmp
)
return tmp
return biolink_snake_case(arg)


class InvalidQueryError(BaseException):
@@ -207,7 +190,7 @@ class InvalidQueryError(BaseException):
class QueryFactory:

# Class member list of valid data types that can be included in query
data_types = ["phenotypic_feature", "gene", "disease", "chemical_substance",
data_types = ["publication", "phenotypic_feature", "gene", "disease", "chemical_substance",
"drug_exposure", "biological_process", "anatomical_entity", "small_molecule",
"chemical_mixture", "chemical_entity"]

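Adding publication to data_types is what lets expand_to_dug_element build concept-to-publication queries without tripping InvalidQueryError. A sketch (the source string is a hypothetical placeholder):

from dug.core.tranql import QueryFactory, InvalidQueryError

query = QueryFactory(["disease", "publication"], source="/tranql/query")  # now valid

try:
    QueryFactory(["disease", "not_a_known_type"], source="/tranql/query")
except InvalidQueryError:
    pass  # unsupported types are rejected, so the crawler can skip them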
24 changes: 24 additions & 0 deletions src/dug/utils.py
@@ -1,3 +1,5 @@
import re

class ObjectFactory:
def __init__(self):
self._builders = {}
@@ -34,3 +36,25 @@ def get_dbgap_study_link(study_id):
def get_nida_study_link(study_id):
base_url = "https://datashare.nida.nih.gov/study"
return f'{base_url}/{study_id}'


def biolink_snake_case(arg):
"""Convert such SnakeCase to snake_case.
Non-alphanumeric characters are replaced with _.
CamelCase is replaced with snake_case.
"""
# replace non-alphanumeric characters with _
tmp = re.sub(r'\W', '_', arg)
# replace X with _x
tmp = re.sub(
r'(?<=[a-z])[A-Z](?=[a-z])',
lambda c: '_' + c.group(0).lower(),
tmp
)
# lower-case first character
tmp = re.sub(
r'^[A-Z](?=[a-z])',
lambda c: c.group(0).lower(),
tmp
)
return tmp
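A few illustrative conversions; note that callers such as crawler.py strip the "biolink:" prefix before converting:

from dug.utils import biolink_snake_case

biolink_snake_case("PhenotypicFeature")  # 'phenotypic_feature'
biolink_snake_case("Publication")        # 'publication'
biolink_snake_case("ChemicalSubstance")  # 'chemical_substance'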
63 changes: 63 additions & 0 deletions tests/unit/mocks/MockCrawler.py
@@ -0,0 +1,63 @@
from unittest.mock import MagicMock, Mock

import pytest
import os
import json


from dug.core.annotate import Identifier
from dug.core.tranql import QueryFactory, QueryKG

# Make some simple mocks
ParserMock = MagicMock()
HTTPSessionMock = MagicMock()

# mocking tranql queries
TranqlQueriesMock = {}
for key, query in {
"disease": ["disease", "phenotypic_feature"],
"pheno": ["phenotypic_feature", "disease"]
}.items():
TranqlQueriesMock[key] = QueryFactory(query, source="test")


# For testing with no id exclusions
ExcludedIDs = []

ANNOTATED_IDS = [
Identifier("MONDO:0", "0", ["disease"]),
Identifier("PUBCHEM.COMPOUND:1", "1", ["chemical"])
]
for ids in ANNOTATED_IDS:
ids.type = ids.types[0]
# annotator with annotate method returning mocked concepts
AnnotatorMock = MagicMock()
AnnotatorMock.annotate = Mock(return_value=ANNOTATED_IDS)

# tranqlizer returning mock kg when expanding concepts
TranqlizerMock = MagicMock()

# Get example tranql answer
with open(os.path.join(os.path.dirname(__file__), "data", "tranql_response.json")) as stream:
tranql_json = json.load(stream)
kg_answer = QueryKG(kg_json=tranql_json)
TRANQL_ANSWERS = []
for answer in kg_answer.answers:
TRANQL_ANSWERS.append(kg_answer.get_answer_subgraph(answer))

TranqlizerMock.expand_identifier = Mock(return_value=TRANQL_ANSWERS)

# Mock a crawler with mock dependencies
@pytest.fixture
def crawler_init_args_no_graph_extraction():
return {
"crawl_file": "test",
"parser": ParserMock,
"annotator": AnnotatorMock,
"tranqlizer": TranqlizerMock,
"tranql_queries": TranqlQueriesMock,
"http_session": HTTPSessionMock,
"exclude_identifiers": ExcludedIDs,
"element_type": "TestElement",
"element_extraction": None
}
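A sketch of a test consuming this fixture (hypothetical test module):

from dug.core.crawler import Crawler

def test_crawler_init(crawler_init_args_no_graph_extraction):
    crawler = Crawler(**crawler_init_args_no_graph_extraction)
    # with element_extraction=None, crawl() skips graph-based element extraction
    assert crawler.element_extraction is None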
Empty file added tests/unit/mocks/__init__.py