Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update nn desc #311

Merged
merged 11 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified data/bdc_dbgap_data_dicts.tar.gz
Binary file not shown.
Binary file removed data/redis/appendonly.aof
Binary file not shown.
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
elasticsearch[async]==7.16.3
fastapi==0.95.0
uvicorn
uvicorn==0.23.2
gunicorn
itsdangerous
Jinja2
Expand All @@ -19,5 +19,6 @@ six==1.16.0

# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above uses.
click~=7.0
click
httpx>=0.24.1
bmt==1.1.0
8 changes: 5 additions & 3 deletions src/dug/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ class Config:

# Normalizer config that will be passed to annotate.Normalizer constructor
normalizer: dict = field(default_factory=lambda: {
"url": "https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&curie="
"url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie="
})

# Synonym service config that will be passed to annotate.SynonymHelper constructor
synonym_service: dict = field(default_factory=lambda: {
"url": "https://onto.renci.org/synonyms/"
"url": "https://name-resolution-sri.renci.org/reverse_lookup"
})

# Ontology metadata helper config that will be passed to annotate.OntologyHelper constructor
Expand All @@ -59,7 +59,9 @@ class Config:
"disease": ["disease", "phenotypic_feature"],
"pheno": ["phenotypic_feature", "disease"],
"anat": ["disease", "anatomical_entity"],
"chem_to_disease": ["chemical_substance", "disease"],
"chem_to_disease": ["chemical_entity", "disease"],
"small_molecule_to_disease": ["small_molecule", "disease"],
"chemical_mixture_to_disease": ["chemical_mixture", "disease"],
"phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
})

Expand Down
73 changes: 26 additions & 47 deletions src/dug/core/annotate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import json
import logging
import os
import re
import urllib.parse
from typing import TypeVar, Generic, Union, List, Tuple, Optional
import bmt
import requests
from requests import Session

Expand Down Expand Up @@ -59,14 +61,12 @@ def __init__(
annotator: "Annotator",
normalizer: "Normalizer",
synonym_finder: "SynonymFinder",
ontology_helper: "OntologyHelper",
ontology_greenlist=[],
):
self.preprocessor = preprocessor
self.annotator = annotator
self.normalizer = normalizer
self.synonym_finder = synonym_finder
self.ontology_helper = ontology_helper
self.ontology_greenlist = ontology_greenlist
self.norm_fails_file = "norm_fails.txt"
self.anno_fails_file = "anno_fails.txt"
Expand Down Expand Up @@ -106,12 +106,6 @@ def annotate(self, text, http_session):
# Add synonyms to identifier
norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session)

# Get canonical label, name, and description from ontology metadata service
name, desc, ontology_type = self.ontology_helper.get_ontology_info(norm_id.id, http_session)
norm_id.label = name
norm_id.description = desc
norm_id.type = ontology_type

# Get pURL for ontology identifier for more info
norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id)
processed_identifiers.append(norm_id)
Expand Down Expand Up @@ -335,6 +329,7 @@ def handle_response(self, value, response: dict) -> List[Identifier]:

class Normalizer(ApiClient[Identifier, Identifier]):
def __init__(self, url):
self.bl_toolkit = bmt.Toolkit()
self.url = url

def normalize(self, identifier: Identifier, http_session: Session):
Expand Down Expand Up @@ -380,9 +375,13 @@ def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[
logger.debug(f"Preferred id: {preferred_id}")
identifier.id = preferred_id.get('identifier', '')
identifier.label = preferred_id.get('label', '')
identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
identifier.types = biolink_type

identifier.description = preferred_id.get('description', '')
identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
try:
identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name
except:
# converts biolink:SmallMolecule to small molecule
identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower()
return identifier


Expand All @@ -400,51 +399,31 @@ def get_synonyms(self, curie: str, http_session):
return self(curie, http_session)

def make_request(self, curie: str, http_session: Session):

# Get response from synonym service
url = f"{self.url}{urllib.parse.quote(curie)}"

# Get response from namelookup reverse lookup op
# example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
url = f"{self.url}"
payload = {
'curies': [curie]
}
try:
response = http_session.get(url)
if response.status_code == 400:
logger.error(f"No synonyms returned for: `{curie}`. Validation error.")
return []
if response.status_code == 500:
logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}.")
return []
response = http_session.post(url, json=payload)
if str(response.status_code).startswith('4'):
logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}")
return {curie: []}
if str(response.status_code).startswith('5'):
logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}")
return {curie: []}
return response.json()
except json.decoder.JSONDecodeError as e:
logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}")
return []
return {curie: []}

def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]:
# List comprehension unpack all synonyms into a list
return [synonym['desc'] for synonym in raw_synonyms]
# Return curie synonyms
return raw_synonyms.get(curie, [])


class OntologyHelper(ApiClient[str, Tuple[str, str, str]]):
def __init__(self, url):
self.url = url

def make_request(self, curie: str, http_session: Session):
url = f"{self.url}{urllib.parse.quote(curie)}"
try:
response = http_session.get(url).json()
return response
except json.decoder.JSONDecodeError as e:
logger.error(f"No labels returned for: {curie}")
return {}

def handle_response(self, curie: str, response: dict) -> Tuple[str,str,str]:
# List comprehension for synonyms
name = response.get('label', '')
description = '' if not response.get('description', None) else response.get('description', '')
ontology_type = '' if not response.get('category', None) else response.get('category', '')[0]

return name, description, ontology_type

def get_ontology_info(self, curie, http_session):
return self(curie, http_session)


class BioLinkPURLerizer:
Expand Down
12 changes: 7 additions & 5 deletions src/dug/core/factory.py
YaphetKG marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
from requests_cache import CachedSession

import dug.core.tranql as tql
from dug.core.annotate import DugAnnotator, Annotator, Normalizer, OntologyHelper, Preprocessor, SynonymFinder, \
ConceptExpander
from dug.core.annotate import (DugAnnotator,
Annotator,
Normalizer,
Preprocessor,
SynonymFinder,
ConceptExpander)
from dug.config import Config as DugConfig, TRANQL_SOURCE
from dug.core.crawler import Crawler
from dug.core.parsers import Parser
Expand Down Expand Up @@ -53,14 +57,12 @@ def build_annotator(self) -> DugAnnotator:
annotator = Annotator(**self.config.annotator)
normalizer = Normalizer(**self.config.normalizer)
synonym_finder = SynonymFinder(**self.config.synonym_service)
ontology_helper = OntologyHelper(**self.config.ontology_helper)

annotator = DugAnnotator(
preprocessor=preprocessor,
annotator=annotator,
normalizer=normalizer,
synonym_finder=synonym_finder,
ontology_helper=ontology_helper
synonym_finder=synonym_finder
)

return annotator
Expand Down
16 changes: 13 additions & 3 deletions src/dug/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __init__(self, cfg: Config, indices=None):

self.es = Elasticsearch(hosts=self.hosts,
http_auth=(self._cfg.elastic_username, self._cfg.elastic_password))
self.replicas = self.get_es_node_count()

if self.es.ping():
logger.info('connected to elasticsearch')
Expand All @@ -36,6 +37,10 @@ def __init__(self, cfg: Config, indices=None):
raise SearchException(
message='failed to connect to elasticsearch',
details=f"connecting to host {self._cfg.elastic_host} and port {self._cfg.elastic_port}")

def get_es_node_count(self):
    """Return the total node count reported by the connected Elasticsearch cluster.

    Used at construction time to size `number_of_replicas` for the indices.
    """
    node_info = self.es.nodes.info()
    return node_info["_nodes"]["total"]


def init_indices(self):
# The concepts and variable indices include an analyzer that utilizes the english
Expand All @@ -49,7 +54,7 @@ def init_indices(self):
kg_index = {
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
"number_of_replicas": self.replicas
},
"mappings": {
"properties": {
Expand All @@ -66,7 +71,7 @@ def init_indices(self):
"settings": {
"index.mapping.coerce": "false",
"number_of_shards": 1,
"number_of_replicas": 0,
"number_of_replicas": self.replicas,
"analysis": {
"analyzer": {
"std_with_stopwords": {
Expand Down Expand Up @@ -104,7 +109,7 @@ def init_indices(self):
"settings": {
"index.mapping.coerce": "false",
"number_of_shards": 1,
"number_of_replicas": 0,
"number_of_replicas": self.replicas,
"analysis": {
"analyzer": {
"std_with_stopwords": {
Expand Down Expand Up @@ -148,6 +153,11 @@ def init_indices(self):
for index in self.indices:
try:
if self.es.indices.exists(index=index):
# if index exists check if replication is good
index_replicas = self.es.indices.get_settings(index=index)[index]["settings"]["index"]["number_of_replicas"]
if index_replicas != self.replicas:
self.es.indices.put_settings(index=index, body={"number_of_replicas": (self.replicas - 1) or 1 })
self.es.indices.refresh(index=index)
logger.info(f"Ignoring index {index} which already exists.")
else:
result = self.es.indices.create(
Expand Down
41 changes: 19 additions & 22 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ def get(self, url, params: dict = None):
if text is None:
return MockResponse(text="{}", status_code=404)
return MockResponse(text, status_code=status_code)

def post(self, url, params: dict = None, json: dict = None):
    """Mimic `requests.Session.post` against the canned url -> (text, status) map.

    `json` is accepted only for signature compatibility with requests and is
    ignored by the mock; `params`, when given, are URL-encoded onto the URL
    before lookup. Returns a MockResponse; unregistered URLs yield a 404.

    Note: default for `json` changed from the mutable `{}` (shared across
    calls) to `None` — the argument is never read, so behavior is unchanged.
    """
    if params:
        qstr = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
        url = f"{url}?{qstr}"
    # Fall back to (None, 404) so a missing URL returns a 404 response
    # instead of raising TypeError when tuple-unpacking None.
    text, status_code = self.urls.get(url, (None, 404))

    if text is None:
        return MockResponse(text="{}", status_code=404)
    return MockResponse(text, status_code=status_code)


@pytest.fixture
Expand Down Expand Up @@ -208,29 +218,16 @@ def _(curie):


@pytest.fixture
def synonym_api():
base_url = "http://synonyms.api/?curie={curie}"

def _(curie):
return base_url.format(
curie=urllib.parse.quote(curie),
)
def synonym_api():
return MockApiService(urls={
_("UBERON:0007100"): [json.dumps([
{
"desc": "adult heart",
"scope": "RELATED",
"syn_type": None,
"xref": ""
}
]), 200],
_("MONDO"): [json.dumps({
"validation error": "format should be <PREFIX>:<XXX>"
}), 400],
_("UNSUPPORTED_PREFIX:XXX"): [json.dumps({
"validation error": "UNSUPPORTED_PREFIX is not supported"
}), 400],

"http://synonyms.api": [json.dumps({
"UBERON:0007100": [
"primary circulatory organ",
"dorsal tube",
"adult heart",
"heart"
]
}), 200]
})


Expand Down
38 changes: 10 additions & 28 deletions tests/unit/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from dug.config import Config
from dug.core.annotate import Identifier, Preprocessor, Annotator, Normalizer, SynonymFinder, OntologyHelper
from dug.core.annotate import Identifier, Preprocessor, Annotator, Normalizer, SynonymFinder


def test_identifier():
Expand Down Expand Up @@ -195,45 +195,27 @@ def test_normalizer(normalizer_api):
assert output.id == 'UBERON:0007100'
assert output.label == "primary circulatory organ"
assert output.equivalent_identifiers == ['UBERON:0007100']
assert output.types == [
'biolink:AnatomicalEntity', 'biolink:OrganismalEntity', 'biolink:BiologicalEntity',
'biolink:NamedThing', 'biolink:Entity'
]
assert output.types == 'anatomical entity'



def test_synonym_finder(synonym_api):
curie = "UBERON:0007100"
url = f"http://synonyms.api/?curie="

url = f"http://synonyms.api"
finder = SynonymFinder(url)
result = finder.get_synonyms(
curie,
synonym_api,
)
assert result == ["adult heart"]
curie = "MONDO"
result = finder.get_synonyms(
curie,
synonym_api,
)
assert result == []
curie = "UNSUPPORTED_PREFIX:XXX"
result = finder.get_synonyms(
curie,
synonym_api,
)
assert result == []
assert result == [
"primary circulatory organ",
"dorsal tube",
"adult heart",
"heart"
]


def test_ontology_helper(ontology_api):
curie = "UBERON:0007100"
url = "http://ontology.api/?curie="

helper = OntologyHelper(url)
name, description, ontology_type = helper.get_ontology_info(curie, ontology_api)
assert name == 'primary circulatory organ'
assert description == 'A hollow, muscular organ, which, by contracting rhythmically, keeps up the circulation of the blood or analogs[GO,modified].'
assert ontology_type == 'anatomical entity'


def test_yield_partial_text():
Expand Down
Loading