diff --git a/src/dug/config.py b/src/dug/config.py index d020ee23..3014e96f 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -47,7 +47,7 @@ class Config: }, "sapbert": { "classification_url": "https://med-nemo.apps.renci.org/annotate/", - "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + "annotator_url": "https://sap-qdrant.apps.renci.org/annotate/", }, } ) diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index 05890517..6d61d008 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -7,6 +7,7 @@ from dug import utils as utils from requests import Session import bmt +from retrying import retry logger = logging.getLogger("dug") @@ -198,6 +199,7 @@ def __call__(self, curie: str, http_session): result = self.handle_response(curie, response) return result + @retry(stop_max_attempt_number=3) def make_request(self, curie: str, http_session: Session): # Get response from namelookup reverse lookup op # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index 6f2c93a6..e252e521 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -1,8 +1,7 @@ import logging from typing import List from requests import Session -import json - +from retrying import retry from dug.core.annotators._base import DugIdentifier, Input from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer @@ -26,6 +25,7 @@ def __init__( ): self.classificationUrl = kwargs.get('classification_url') self.annotatorUrl = kwargs.get('annotator_url') + if not self.classificationUrl: raise TypeError('Classification url needs to be defined for sapbert annotator') if not self.annotatorUrl: @@ -35,7 +35,12 @@ def __init__( self.ontology_greenlist = ontology_greenlist self.norm_fails_file = "norm_fails.txt" self.anno_fails_file = "anno_fails.txt" + # threshold marking cutoff point + self.score_threshold = kwargs.get("score_threshold", 0.8) + # indicate if we want values above or below the threshold. + self.score_direction_up = True if kwargs.get("score_direction", "up") == "up" else False + @retry(stop_max_attempt_number=3) def __call__(self, text, http_session) -> List[DugIdentifier]: # Fetch identifiers classifiers: List = self.text_classification(text, http_session) @@ -132,7 +137,7 @@ def handle_classification_response(self, response: dict) -> List: text = denotation.get("text", None) bl_type = denotation.get("obj", None) classifiers.append( - {"text": text, "bl_type": bl_type.replace("biolink:", "")} + {"text": text, "bl_type": bl_type} ) return classifiers @@ -184,7 +189,7 @@ def make_annotation_request(self, term_dict: Input, http_session: Session): payload = { "text": term_dict["text"], "model_name": "sapbert", - "count": 1000, + "count": 10, "args": {"bl_type": term_dict["bl_type"]}, } # This could be moved to a config file @@ -213,36 +218,14 @@ def handle_annotation_response(self, value, response: dict) -> List[DugIdentifie continue biolink_type = identifier.get('category') - score = identifier.get("score", None) + score = identifier.get("score", 0) label = identifier.get("name") - identifiers.append( - DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) - ) + if score >= self.score_threshold and self.score_direction_up: + identifiers.append( + DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) + ) + elif score <= self.score_threshold and not self.score_direction_up: + identifiers.append( + DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) + ) return identifiers - -## Testing Purposes -# if __name__ == "__main__": -# from dug.config import Config -# import json -# import redis -# from requests_cache import CachedSession -# from dug.core.annotators._base import DefaultNormalizer, DefaultSynonymFinder - -# config = Config.from_env() -# annotator = AnnotateSapbert( -# normalizer=DefaultNormalizer(**config.normalizer), -# synonym_finder=DefaultSynonymFinder(**config.synonym_service), -# ) - -# redis_config = { -# "host": "localhost", -# "port": config.redis_port, -# "password": config.redis_password, -# } - -# http_sesh = CachedSession( -# cache_name="annotator", -# backend="redis", -# connection=redis.StrictRedis(**redis_config), -# ) -# annotator(text="Have you ever had a heart attack?", http_session=http_sesh) diff --git a/src/dug/core/concept_expander.py b/src/dug/core/concept_expander.py index 4dc56494..bc8eef50 100644 --- a/src/dug/core/concept_expander.py +++ b/src/dug/core/concept_expander.py @@ -31,7 +31,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ with open(kg_filename, 'r') as stream: response = json.load(stream) else: - query = query_factory._get_var_query(identifier) + query = query_factory.get_query(identifier) logger.debug(query) response = requests.post( url=self.url, diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py index 8a43ab22..c73c6c68 100644 --- a/src/dug/core/parsers/__init__.py +++ b/src/dug/core/parsers/__init__.py @@ -4,7 +4,7 @@ import pluggy from ._base import DugElement, DugConcept, Indexable, Parser, FileParser -from .dbgap_parser import DbGaPParser, AnvilDbGaPParser, KFDRCDbGaPParser, CRDCDbGaPParser +from .dbgap_parser import * from .nida_parser import NIDAParser from .scicrunch_parser import SciCrunchParser from .topmed_tag_parser import TOPMedTagParser @@ -36,8 +36,19 @@ def define_parsers(parser_dict: Dict[str, Parser]): parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies") parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs") parser_dict["ctn"] = CTNParser() + parser_dict["biolincc"] = BioLINCCDbGaPParser() + parser_dict["covid19"] = Covid19DbGaPParser() + parser_dict["dir"] = DIRDbGaPParser() + parser_dict["lungmap"] = LungMAPDbGaPParser() + parser_dict["nsrr"] = NSRRDbGaPParser() + parser_dict["parent"] = ParentDBGaPParser() + parser_dict["pcgc"] = PCGCDbGaPParser() + parser_dict["recover"] = RECOVERDBGaPParser() + parser_dict["topmeddbgap"] = TopmedDBGaPParser() + parser_dict["curesc"] = CureSC() parser_dict["radx"] = RADxParser() + diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index a362d028..5181b320 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -92,7 +92,57 @@ class CRDCDbGaPParser(DbGaPParser): def _get_element_type(self): return "Cancer Data Commons" + class KFDRCDbGaPParser(DbGaPParser): def _get_element_type(self): return "Kids First" + +class BioLINCCDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "BioLINCC" + + +class Covid19DbGaPParser(DbGaPParser): + def _get_element_type(self): + return "COVID19" + + +class DIRDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "DIR" + + +class LungMAPDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "LungMAP" + + +class NSRRDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "NSRR" + + +class ParentDBGaPParser(DbGaPParser): + def _get_element_type(self): + return "Parent" + + +class PCGCDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "PCGC" + + +class RECOVERDBGaPParser(DbGaPParser): + def _get_element_type(self): + return "RECOVER" + + +class TopmedDBGaPParser(DbGaPParser): + def _get_element_type(self): + return "TOPMed" + + +class CureSC(DbGaPParser): + def _get_element_type(self): + return "CureSC" diff --git a/src/dug/core/parsers/topmed_csv_parser.py b/src/dug/core/parsers/topmed_csv_parser.py index 710bcb63..9725f247 100644 --- a/src/dug/core/parsers/topmed_csv_parser.py +++ b/src/dug/core/parsers/topmed_csv_parser.py @@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem = DugElement(elem_id=row['variable_full_accession'], name=row['variable_name'], desc=row['variable_desc'], - elem_type="dbGaP", + elem_type="TOPMed", collection_id=row['study_full_accession'], collection_name=row['study_name']) diff --git a/src/dug/core/parsers/topmed_tag_parser.py b/src/dug/core/parsers/topmed_tag_parser.py index f10ed43d..88f04493 100644 --- a/src/dug/core/parsers/topmed_tag_parser.py +++ b/src/dug/core/parsers/topmed_tag_parser.py @@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem_id=row['variable_full_accession'], name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'], desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'], - elem_type="dbGaP", + elem_type="TOPMed", collection_id=row['study_full_accession'], collection_name=row['study_name'] )