helxplatform · YaphetKG · Jun 28, 2024 · May 13, 2024 · May 27, 2024 · May 27, 2024
diff --git a/src/dug/config.py b/src/dug/config.py
@@ -47,7 +47,7 @@ class Config:
             },
             "sapbert": {
                 "classification_url": "https://med-nemo.apps.renci.org/annotate/",
-                "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/",
+                "annotator_url": "https://sap-qdrant.apps.renci.org/annotate/",
             },
         }
     )

diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py
@@ -7,6 +7,7 @@
 from dug import utils as utils
 from requests import Session
 import bmt
+from retrying import retry
 
 logger = logging.getLogger("dug")
 
@@ -198,6 +199,7 @@ def __call__(self, curie: str, http_session):
         result = self.handle_response(curie, response)
         return result
 
+    @retry(stop_max_attempt_number=3)
     def make_request(self, curie: str, http_session: Session):
         # Get response from namelookup reverse lookup op
         # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)

diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py
@@ -1,8 +1,7 @@
 import logging
 from typing import List
 from requests import Session
-import json
-
+from retrying import retry
 from dug.core.annotators._base import DugIdentifier, Input
 from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer
 
@@ -26,6 +25,7 @@ def __init__(
     ):
         self.classificationUrl = kwargs.get('classification_url')
         self.annotatorUrl = kwargs.get('annotator_url')
+
         if not self.classificationUrl:
             raise TypeError('Classification url needs to be defined for sapbert annotator')
         if not self.annotatorUrl:
@@ -35,7 +35,12 @@ def __init__(
         self.ontology_greenlist = ontology_greenlist
         self.norm_fails_file = "norm_fails.txt"
         self.anno_fails_file = "anno_fails.txt"
+        # Score cutoff for annotation hits; overridable via the "score_threshold" kwarg.
+        self.score_threshold = kwargs.get("score_threshold", 0.8)
+        # "score_direction" == "up" keeps hits at/above the cutoff; any other value keeps hits at/below it.
+        self.score_direction_up = kwargs.get("score_direction", "up") == "up"
 
+    @retry(stop_max_attempt_number=3)
     def __call__(self, text, http_session) -> List[DugIdentifier]:
         # Fetch identifiers
         classifiers: List = self.text_classification(text, http_session)
@@ -132,7 +137,7 @@ def handle_classification_response(self, response: dict) -> List:
             text = denotation.get("text", None)
             bl_type = denotation.get("obj", None)
             classifiers.append(
-                {"text": text, "bl_type": bl_type.replace("biolink:", "")}
+                {"text": text, "bl_type": bl_type}
             )
         return classifiers
 
@@ -184,7 +189,7 @@ def make_annotation_request(self, term_dict: Input, http_session: Session):
         payload = {
             "text": term_dict["text"],
             "model_name": "sapbert",
-            "count": 1000,
+            "count": 10,
             "args": {"bl_type": term_dict["bl_type"]},
         }
         # This could be moved to a config file
@@ -213,36 +218,17 @@ def handle_annotation_response(self, value, response: dict) -> List[DugIdentifie
                 continue
 
             biolink_type = identifier.get('category')
-            score = identifier.get("score", None)
+            # A missing or null score is treated as 0 so the comparison below cannot raise.
+            score = identifier.get("score") or 0
             label = identifier.get("name")
-            identifiers.append(
-                DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
-            )
+            # Keep the hit only when its score lies on the configured side of
+            # the threshold; the bound itself is included in both directions.
+            if self.score_direction_up:
+                keep = score >= self.score_threshold
+            else:
+                keep = score <= self.score_threshold
+            if keep:
+                identifiers.append(
+                    DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
+                )
         return identifiers
-
-## Testing Purposes
-# if __name__ == "__main__":
-#     from dug.config import Config
-#     import json
-#     import redis
-#     from requests_cache import CachedSession
-#     from dug.core.annotators._base import DefaultNormalizer, DefaultSynonymFinder
-
-#     config = Config.from_env()
-#     annotator = AnnotateSapbert(
-#         normalizer=DefaultNormalizer(**config.normalizer),
-#         synonym_finder=DefaultSynonymFinder(**config.synonym_service),
-#     )
-
-#     redis_config = {
-#         "host": "localhost",
-#         "port": config.redis_port,
-#         "password": config.redis_password,
-#     }
-
-#     http_sesh = CachedSession(
-#         cache_name="annotator",
-#         backend="redis",
-#         connection=redis.StrictRedis(**redis_config),
-#     )
-#     annotator(text="Have you ever had a heart attack?", http_session=http_sesh)
diff --git a/src/dug/core/concept_expander.py b/src/dug/core/concept_expander.py
@@ -31,7 +31,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_
             with open(kg_filename, 'r') as stream:
                 response = json.load(stream)
         else:
-            query = query_factory._get_var_query(identifier)
+            query = query_factory.get_query(identifier)
             logger.debug(query)
             response = requests.post(
                 url=self.url,

diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py
@@ -4,7 +4,23 @@
 import pluggy
 
 from ._base import DugElement, DugConcept, Indexable, Parser, FileParser
-from .dbgap_parser import DbGaPParser, AnvilDbGaPParser, KFDRCDbGaPParser, CRDCDbGaPParser
+# Explicit names (no wildcard) so the package namespace only exposes the parser classes.
+from .dbgap_parser import (
+    DbGaPParser,
+    AnvilDbGaPParser,
+    KFDRCDbGaPParser,
+    CRDCDbGaPParser,
+    BioLINCCDbGaPParser,
+    Covid19DbGaPParser,
+    DIRDbGaPParser,
+    LungMAPDbGaPParser,
+    NSRRDbGaPParser,
+    ParentDBGaPParser,
+    PCGCDbGaPParser,
+    RECOVERDBGaPParser,
+    TopmedDBGaPParser,
+    CureSC,
+)
 from .nida_parser import NIDAParser
 from .scicrunch_parser import SciCrunchParser
 from .topmed_tag_parser import TOPMedTagParser
@@ -36,8 +52,19 @@ def define_parsers(parser_dict: Dict[str, Parser]):
     parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies")
     parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs")
     parser_dict["ctn"] = CTNParser()
+    parser_dict["biolincc"] = BioLINCCDbGaPParser()
+    parser_dict["covid19"] = Covid19DbGaPParser()
+    parser_dict["dir"] = DIRDbGaPParser()
+    parser_dict["lungmap"] = LungMAPDbGaPParser()
+    parser_dict["nsrr"] = NSRRDbGaPParser()
+    parser_dict["parent"] = ParentDBGaPParser()
+    parser_dict["pcgc"] = PCGCDbGaPParser()
+    parser_dict["recover"] = RECOVERDBGaPParser()
+    parser_dict["topmeddbgap"] = TopmedDBGaPParser()
+    parser_dict["curesc"] = CureSC()
     parser_dict["radx"] = RADxParser()
 
+
 
 
 

diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py
@@ -92,7 +92,79 @@ class CRDCDbGaPParser(DbGaPParser):
     def _get_element_type(self):
         return "Cancer Data Commons"
 
+
 class KFDRCDbGaPParser(DbGaPParser):
     def _get_element_type(self):
         return "Kids First"
 
+
+class BioLINCCDbGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "BioLINCC" as its element type."""
+
+    def _get_element_type(self):
+        return "BioLINCC"
+
+
+class Covid19DbGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "COVID19" as its element type."""
+
+    def _get_element_type(self):
+        return "COVID19"
+
+
+class DIRDbGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "DIR" as its element type."""
+
+    def _get_element_type(self):
+        return "DIR"
+
+
+class LungMAPDbGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "LungMAP" as its element type."""
+
+    def _get_element_type(self):
+        return "LungMAP"
+
+
+class NSRRDbGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "NSRR" as its element type."""
+
+    def _get_element_type(self):
+        return "NSRR"
+
+
+# NOTE(review): ParentDBGaPParser, RECOVERDBGaPParser and TopmedDBGaPParser spell "DBGaP" with a
+# capital B, and CureSC has no "DbGaPParser" suffix, unlike the sibling subclasses - confirm intended.
+class ParentDBGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "Parent" as its element type."""
+
+    def _get_element_type(self):
+        return "Parent"
+
+
+class PCGCDbGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "PCGC" as its element type."""
+
+    def _get_element_type(self):
+        return "PCGC"
+
+
+class RECOVERDBGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "RECOVER" as its element type."""
+
+    def _get_element_type(self):
+        return "RECOVER"
+
+
+class TopmedDBGaPParser(DbGaPParser):
+    """DbGaPParser subclass that reports "TOPMed" as its element type."""
+
+    def _get_element_type(self):
+        return "TOPMed"
+
+
+class CureSC(DbGaPParser):
+    """DbGaPParser subclass that reports "CureSC" as its element type."""
+
+    def _get_element_type(self):
+        return "CureSC"
diff --git a/src/dug/core/parsers/topmed_csv_parser.py b/src/dug/core/parsers/topmed_csv_parser.py
@@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
                 elem = DugElement(elem_id=row['variable_full_accession'],
                                   name=row['variable_name'],
                                   desc=row['variable_desc'],
-                                  elem_type="dbGaP",
+                                  elem_type="TOPMed",
                                   collection_id=row['study_full_accession'],
                                   collection_name=row['study_name'])
 

diff --git a/src/dug/core/parsers/topmed_tag_parser.py b/src/dug/core/parsers/topmed_tag_parser.py
@@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
                     elem_id=row['variable_full_accession'],
                     name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'],
                     desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'],
-                    elem_type="dbGaP",
+                    elem_type="TOPMed",
                     collection_id=row['study_full_accession'],
                     collection_name=row['study_name']
                 )