Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update nn desc #311

Merged
merged 11 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified data/bdc_dbgap_data_dicts.tar.gz
Binary file not shown.
Binary file removed data/redis/appendonly.aof
Binary file not shown.
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
elasticsearch[async]==7.16.3
fastapi==0.95.0
uvicorn
uvicorn==0.23.2
gunicorn
itsdangerous
Jinja2
Expand All @@ -19,5 +19,6 @@ six==1.16.0

# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above uses.
click~=7.0
click
httpx>=0.24.1
bmt==1.1.0
8 changes: 5 additions & 3 deletions src/dug/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ class Config:

# Normalizer config that will be passed to annotate.Normalizer constructor
normalizer: dict = field(default_factory=lambda: {
"url": "https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&curie="
"url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie="
})

# Synonym service config that will be passed to annotate.SynonymHelper constructor
synonym_service: dict = field(default_factory=lambda: {
"url": "https://onto.renci.org/synonyms/"
"url": "https://name-resolution-sri.renci.org/reverse_lookup"
})

# Ontology metadata helper config that will be passed to annotate.OntologyHelper constructor
Expand All @@ -59,7 +59,9 @@ class Config:
"disease": ["disease", "phenotypic_feature"],
"pheno": ["phenotypic_feature", "disease"],
"anat": ["disease", "anatomical_entity"],
"chem_to_disease": ["chemical_substance", "disease"],
"chem_to_disease": ["chemical_entity", "disease"],
"small_molecule_to_disease": ["small_molecule", "disease"],
"chemical_mixture_to_disease": ["chemical_mixture", "disease"],
"phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
})

Expand Down
73 changes: 26 additions & 47 deletions src/dug/core/annotate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import json
import logging
import os
import re
import urllib.parse
from typing import TypeVar, Generic, Union, List, Tuple, Optional
import bmt
import requests
from requests import Session

Expand Down Expand Up @@ -59,14 +61,12 @@ def __init__(
annotator: "Annotator",
normalizer: "Normalizer",
synonym_finder: "SynonymFinder",
ontology_helper: "OntologyHelper",
ontology_greenlist=[],
):
self.preprocessor = preprocessor
self.annotator = annotator
self.normalizer = normalizer
self.synonym_finder = synonym_finder
self.ontology_helper = ontology_helper
self.ontology_greenlist = ontology_greenlist
self.norm_fails_file = "norm_fails.txt"
self.anno_fails_file = "anno_fails.txt"
Expand Down Expand Up @@ -106,12 +106,6 @@ def annotate(self, text, http_session):
# Add synonyms to identifier
norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session)

# Get canonical label, name, and description from ontology metadata service
name, desc, ontology_type = self.ontology_helper.get_ontology_info(norm_id.id, http_session)
norm_id.label = name
norm_id.description = desc
norm_id.type = ontology_type

# Get pURL for ontology identifier for more info
norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id)
processed_identifiers.append(norm_id)
Expand Down Expand Up @@ -335,6 +329,7 @@ def handle_response(self, value, response: dict) -> List[Identifier]:

class Normalizer(ApiClient[Identifier, Identifier]):
def __init__(self, url):
self.bl_toolkit = bmt.Toolkit()
self.url = url

def normalize(self, identifier: Identifier, http_session: Session):
Expand Down Expand Up @@ -380,9 +375,13 @@ def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[
logger.debug(f"Preferred id: {preferred_id}")
identifier.id = preferred_id.get('identifier', '')
identifier.label = preferred_id.get('label', '')
identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
identifier.types = biolink_type

identifier.description = preferred_id.get('description', '')
identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
try:
identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name
except:
# converts biolink:SmallMolecule to small molecule
identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower()
return identifier


Expand All @@ -400,51 +399,31 @@ def get_synonyms(self, curie: str, http_session):
return self(curie, http_session)

def make_request(self, curie: str, http_session: Session):

# Get response from synonym service
url = f"{self.url}{urllib.parse.quote(curie)}"

# Get response from namelookup reverse lookup op
# example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
url = f"{self.url}"
payload = {
'curies': [curie]
}
try:
response = http_session.get(url)
if response.status_code == 400:
logger.error(f"No synonyms returned for: `{curie}`. Validation error.")
return []
if response.status_code == 500:
logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}.")
return []
response = http_session.post(url, json=payload)
if str(response.status_code).startswith('4'):
logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}")
return {curie: []}
if str(response.status_code).startswith('5'):
logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}")
return {curie: []}
return response.json()
except json.decoder.JSONDecodeError as e:
logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}")
return []
return {curie: []}

def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]:
# List comprehension unpack all synonyms into a list
return [synonym['desc'] for synonym in raw_synonyms]
# Return curie synonyms
return raw_synonyms.get(curie, [])


class OntologyHelper(ApiClient[str, Tuple[str, str, str]]):
def __init__(self, url):
self.url = url

def make_request(self, curie: str, http_session: Session):
url = f"{self.url}{urllib.parse.quote(curie)}"
try:
response = http_session.get(url).json()
return response
except json.decoder.JSONDecodeError as e:
logger.error(f"No labels returned for: {curie}")
return {}

def handle_response(self, curie: str, response: dict) -> Tuple[str,str,str]:
# List comprehension for synonyms
name = response.get('label', '')
description = '' if not response.get('description', None) else response.get('description', '')
ontology_type = '' if not response.get('category', None) else response.get('category', '')[0]

return name, description, ontology_type

def get_ontology_info(self, curie, http_session):
return self(curie, http_session)


class BioLinkPURLerizer:
Expand Down
12 changes: 7 additions & 5 deletions src/dug/core/factory.py
YaphetKG marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
from requests_cache import CachedSession

import dug.core.tranql as tql
from dug.core.annotate import DugAnnotator, Annotator, Normalizer, OntologyHelper, Preprocessor, SynonymFinder, \
ConceptExpander
from dug.core.annotate import (DugAnnotator,
Annotator,
Normalizer,
Preprocessor,
SynonymFinder,
ConceptExpander)
from dug.config import Config as DugConfig, TRANQL_SOURCE
from dug.core.crawler import Crawler
from dug.core.parsers import Parser
Expand Down Expand Up @@ -53,14 +57,12 @@ def build_annotator(self) -> DugAnnotator:
annotator = Annotator(**self.config.annotator)
normalizer = Normalizer(**self.config.normalizer)
synonym_finder = SynonymFinder(**self.config.synonym_service)
ontology_helper = OntologyHelper(**self.config.ontology_helper)

annotator = DugAnnotator(
preprocessor=preprocessor,
annotator=annotator,
normalizer=normalizer,
synonym_finder=synonym_finder,
ontology_helper=ontology_helper
synonym_finder=synonym_finder
)

return annotator
Expand Down
16 changes: 13 additions & 3 deletions src/dug/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __init__(self, cfg: Config, indices=None):

self.es = Elasticsearch(hosts=self.hosts,
http_auth=(self._cfg.elastic_username, self._cfg.elastic_password))
self.replicas = self.get_es_node_count()

if self.es.ping():
logger.info('connected to elasticsearch')
Expand All @@ -36,6 +37,10 @@ def __init__(self, cfg: Config, indices=None):
raise SearchException(
message='failed to connect to elasticsearch',
details=f"connecting to host {self._cfg.elastic_host} and port {self._cfg.elastic_port}")

def get_es_node_count(self):
    """Return the total node count reported by the connected Elasticsearch cluster.

    Used at construction time to size `number_of_replicas` for the indices.
    """
    node_info = self.es.nodes.info()
    return node_info["_nodes"]["total"]


def init_indices(self):
# The concepts and variable indices include an analyzer that utilizes the english
Expand All @@ -49,7 +54,7 @@ def init_indices(self):
kg_index = {
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
"number_of_replicas": self.replicas
},
"mappings": {
"properties": {
Expand All @@ -66,7 +71,7 @@ def init_indices(self):
"settings": {
"index.mapping.coerce": "false",
"number_of_shards": 1,
"number_of_replicas": 0,
"number_of_replicas": self.replicas,
"analysis": {
"analyzer": {
"std_with_stopwords": {
Expand Down Expand Up @@ -104,7 +109,7 @@ def init_indices(self):
"settings": {
"index.mapping.coerce": "false",
"number_of_shards": 1,
"number_of_replicas": 0,
"number_of_replicas": self.replicas,
"analysis": {
"analyzer": {
"std_with_stopwords": {
Expand Down Expand Up @@ -148,6 +153,11 @@ def init_indices(self):
for index in self.indices:
try:
if self.es.indices.exists(index=index):
# if index exists check if replication is good
index_replicas = self.es.indices.get_settings(index=index)[index]["settings"]["index"]["number_of_replicas"]
if index_replicas != self.replicas:
self.es.indices.put_settings(index=index, body={"number_of_replicas": (self.replicas - 1) or 1 })
self.es.indices.refresh(index=index)
logger.info(f"Ignoring index {index} which already exists.")
else:
result = self.es.indices.create(
Expand Down
41 changes: 19 additions & 22 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ def get(self, url, params: dict = None):
if text is None:
return MockResponse(text="{}", status_code=404)
return MockResponse(text, status_code=status_code)

def post(self, url, params: dict = None, json: dict = None):
    """Mimic `requests.Session.post` against the canned url -> (text, status) map.

    `json` is accepted only for signature compatibility with requests and is
    ignored by the mock; `params`, when given, are URL-encoded onto the URL
    before lookup. Returns a MockResponse; unregistered URLs yield a 404.

    Note: default for `json` changed from the mutable `{}` (shared across
    calls) to `None` — the argument is never read, so behavior is unchanged.
    """
    if params:
        qstr = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
        url = f"{url}?{qstr}"
    # Fall back to (None, 404) so a missing URL returns a 404 response
    # instead of raising TypeError when tuple-unpacking None.
    text, status_code = self.urls.get(url, (None, 404))

    if text is None:
        return MockResponse(text="{}", status_code=404)
    return MockResponse(text, status_code=status_code)


@pytest.fixture
Expand Down Expand Up @@ -208,29 +218,16 @@ def _(curie):


@pytest.fixture
def synonym_api():
base_url = "http://synonyms.api/?curie={curie}"

def _(curie):
return base_url.format(
curie=urllib.parse.quote(curie),
)
def synonym_api():
return MockApiService(urls={
_("UBERON:0007100"): [json.dumps([
{
"desc": "adult heart",
"scope": "RELATED",
"syn_type": None,
"xref": ""
}
]), 200],
_("MONDO"): [json.dumps({
"validation error": "format should be <PREFIX>:<XXX>"
}), 400],
_("UNSUPPORTED_PREFIX:XXX"): [json.dumps({
"validation error": "UNSUPPORTED_PREFIX is not supported"
}), 400],

"http://synonyms.api": [json.dumps({
"UBERON:0007100": [
"primary circulatory organ",
"dorsal tube",
"adult heart",
"heart"
]
}), 200]
})


Expand Down
38 changes: 10 additions & 28 deletions tests/unit/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from dug.config import Config
from dug.core.annotate import Identifier, Preprocessor, Annotator, Normalizer, SynonymFinder, OntologyHelper
from dug.core.annotate import Identifier, Preprocessor, Annotator, Normalizer, SynonymFinder


def test_identifier():
Expand Down Expand Up @@ -195,45 +195,27 @@ def test_normalizer(normalizer_api):
assert output.id == 'UBERON:0007100'
assert output.label == "primary circulatory organ"
assert output.equivalent_identifiers == ['UBERON:0007100']
assert output.types == [
'biolink:AnatomicalEntity', 'biolink:OrganismalEntity', 'biolink:BiologicalEntity',
'biolink:NamedThing', 'biolink:Entity'
]
assert output.types == 'anatomical entity'



def test_synonym_finder(synonym_api):
curie = "UBERON:0007100"
url = f"http://synonyms.api/?curie="

url = f"http://synonyms.api"
finder = SynonymFinder(url)
result = finder.get_synonyms(
curie,
synonym_api,
)
assert result == ["adult heart"]
curie = "MONDO"
result = finder.get_synonyms(
curie,
synonym_api,
)
assert result == []
curie = "UNSUPPORTED_PREFIX:XXX"
result = finder.get_synonyms(
curie,
synonym_api,
)
assert result == []
assert result == [
"primary circulatory organ",
"dorsal tube",
"adult heart",
"heart"
]


def test_ontology_helper(ontology_api):
curie = "UBERON:0007100"
url = "http://ontology.api/?curie="

helper = OntologyHelper(url)
name, description, ontology_type = helper.get_ontology_info(curie, ontology_api)
assert name == 'primary circulatory organ'
assert description == 'A hollow, muscular organ, which, by contracting rhythmically, keeps up the circulation of the blood or analogs[GO,modified].'
assert ontology_type == 'anatomical entity'


def test_yield_partial_text():
Expand Down
Loading