From 23346506469998e73aac33b13beec1c76f4337fc Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 13 May 2024 18:15:58 -0400 Subject: [PATCH 1/5] new parsers and fun things annotation change, config additions for threshold & direction --- src/dug/config.py | 2 +- src/dug/core/annotators/sapbert_annotator.py | 22 ++++++++---- src/dug/core/parsers/__init__.py | 11 +++++- src/dug/core/parsers/dbgap_parser.py | 35 ++++++++++++++++++++ src/dug/core/parsers/topmed_csv_parser.py | 2 +- src/dug/core/parsers/topmed_tag_parser.py | 2 +- 6 files changed, 64 insertions(+), 10 deletions(-) diff --git a/src/dug/config.py b/src/dug/config.py index b070cac1..90d1b0ae 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -43,7 +43,7 @@ class Config: }, "sapbert": { "classification_url": "https://med-nemo.apps.renci.org/annotate/", - "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + "annotator_url": "https://sap-qdrant.apps.renci.org/annotate/", }, } ) diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index 6f2c93a6..b8de1762 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -26,6 +26,7 @@ def __init__( ): self.classificationUrl = kwargs.get('classification_url') self.annotatorUrl = kwargs.get('annotator_url') + if not self.classificationUrl: raise TypeError('Classification url needs to be defined for sapbert annotator') if not self.annotatorUrl: @@ -35,6 +36,10 @@ def __init__( self.ontology_greenlist = ontology_greenlist self.norm_fails_file = "norm_fails.txt" self.anno_fails_file = "anno_fails.txt" + # threshold marking cutoff point + self.score_threshold = kwargs.get("score_threshold", 0.8) + # indicate if we want values above or below the threshold. + self.score_direction_up = True if kwargs.get("score_direction", "up") == "up" else False def __call__(self, text, http_session) -> List[DugIdentifier]: # Fetch identifiers @@ -132,7 +137,7 @@ def handle_classification_response(self, response: dict) -> List: text = denotation.get("text", None) bl_type = denotation.get("obj", None) classifiers.append( - {"text": text, "bl_type": bl_type.replace("biolink:", "")} + {"text": text, "bl_type": bl_type} ) return classifiers @@ -184,7 +189,7 @@ def make_annotation_request(self, term_dict: Input, http_session: Session): payload = { "text": term_dict["text"], "model_name": "sapbert", - "count": 1000, + "count": 10, "args": {"bl_type": term_dict["bl_type"]}, } # This could be moved to a config file @@ -213,11 +218,16 @@ def handle_annotation_response(self, value, response: dict) -> List[DugIdentifie continue biolink_type = identifier.get('category') - score = identifier.get("score", None) + score = identifier.get("score", 0) label = identifier.get("name") - identifiers.append( - DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) - ) + if score >= self.score_threshold and self.score_direction_up: + identifiers.append( + DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) + ) + elif score <= self.score_threshold and not self.score_direction_up: + identifiers.append( + DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) + ) return identifiers ## Testing Purposes diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py index aeec0516..c1a5102e 100644 --- a/src/dug/core/parsers/__init__.py +++ b/src/dug/core/parsers/__init__.py @@ -4,7 +4,7 @@ import pluggy from ._base import DugElement, DugConcept, Indexable, Parser, FileParser -from .dbgap_parser import DbGaPParser, AnvilDbGaPParser, KFDRCDbGaPParser, CRDCDbGaPParser +from .dbgap_parser import * from .nida_parser import NIDAParser from .scicrunch_parser import SciCrunchParser from .topmed_tag_parser import TOPMedTagParser @@ -35,6 +35,15 @@ def define_parsers(parser_dict: Dict[str, Parser]): parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies") parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs") parser_dict["ctn"] = CTNParser() + parser_dict["biolincc"] = BioLINCCDbGaPParser() + parser_dict["covid19"] = Covid19DbGaPParser() + parser_dict["dir"] = DIRDbGaPParser() + parser_dict["lungmap"] = LungMAPDbGaPParser() + parser_dict["nsrr"] = NSRRDbGaPParser() + parser_dict["parent"] = ParentDBGaPParser() + parser_dict["pcgc"] = PCGCDbGaPParser() + parser_dict["recover"] = RECOVERDBGaPParser() + parser_dict["topmeddbgap"] = TopmedDBGaPParser() diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index a362d028..d08e5b6e 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -96,3 +96,38 @@ class KFDRCDbGaPParser(DbGaPParser): def _get_element_type(self): return "Kids First" +class BioLINCCDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "BioLINCC" + +class Covid19DbGaPParser(DbGaPParser): + def _get_element_type(self): + return "COVID19" + +class DIRDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "DIR" + +class LungMAPDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "LungMAP" + +class NSRRDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "NSRR" + +class ParentDBGaPParser(DbGaPParser): + def _get_element_type(self): + return "Parent" + +class PCGCDbGaPParser(DbGaPParser): + def _get_element_type(self): + return "PCGC" + +class RECOVERDBGaPParser(DbGaPParser): + def _get_element_type(self): + return "RECOVER" + +class TopmedDBGaPParser(DbGaPParser): + def _get_element_type(self): + return "TOPMed" \ No newline at end of file diff --git a/src/dug/core/parsers/topmed_csv_parser.py b/src/dug/core/parsers/topmed_csv_parser.py index 710bcb63..9725f247 100644 --- a/src/dug/core/parsers/topmed_csv_parser.py +++ b/src/dug/core/parsers/topmed_csv_parser.py @@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem = DugElement(elem_id=row['variable_full_accession'], name=row['variable_name'], desc=row['variable_desc'], - elem_type="dbGaP", + elem_type="TOPMed", collection_id=row['study_full_accession'], collection_name=row['study_name']) diff --git a/src/dug/core/parsers/topmed_tag_parser.py b/src/dug/core/parsers/topmed_tag_parser.py index f10ed43d..88f04493 100644 --- a/src/dug/core/parsers/topmed_tag_parser.py +++ b/src/dug/core/parsers/topmed_tag_parser.py @@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem_id=row['variable_full_accession'], name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'], desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'], - elem_type="dbGaP", + elem_type="TOPMed", collection_id=row['study_full_accession'], collection_name=row['study_name'] ) From e2b215c9c0d941088f49129b80679d612e814277 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 27 May 2024 11:38:45 -0400 Subject: [PATCH 2/5] retry --- src/dug/core/annotators/_base.py | 2 ++ src/dug/core/annotators/sapbert_annotator.py | 31 ++------------------ 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index 05890517..6d61d008 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -7,6 +7,7 @@ from dug import utils as utils from requests import Session import bmt +from retrying import retry logger = logging.getLogger("dug") @@ -198,6 +199,7 @@ def __call__(self, curie: str, http_session): result = self.handle_response(curie, response) return result + @retry(stop_max_attempt_number=3) def make_request(self, curie: str, http_session: Session): # Get response from namelookup reverse lookup op # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index b8de1762..e0e84837 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -1,8 +1,7 @@ import logging from typing import List from requests import Session -import json - +from retrying import retry from dug.core.annotators._base import DugIdentifier, Input from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer @@ -41,6 +40,7 @@ def __init__( # indicate if we want values above or below the threshold. self.score_direction_up = True if kwargs.get("score_direction", "up") == "up" else False + @retry(max_attempts=3) def __call__(self, text, http_session) -> List[DugIdentifier]: # Fetch identifiers classifiers: List = self.text_classification(text, http_session) @@ -229,30 +229,3 @@ def handle_annotation_response(self, value, response: dict) -> List[DugIdentifie DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) ) return identifiers - -## Testing Purposes -# if __name__ == "__main__": -# from dug.config import Config -# import json -# import redis -# from requests_cache import CachedSession -# from dug.core.annotators._base import DefaultNormalizer, DefaultSynonymFinder - -# config = Config.from_env() -# annotator = AnnotateSapbert( -# normalizer=DefaultNormalizer(**config.normalizer), -# synonym_finder=DefaultSynonymFinder(**config.synonym_service), -# ) - -# redis_config = { -# "host": "localhost", -# "port": config.redis_port, -# "password": config.redis_password, -# } - -# http_sesh = CachedSession( -# cache_name="annotator", -# backend="redis", -# connection=redis.StrictRedis(**redis_config), -# ) -# annotator(text="Have you ever had a heart attack?", http_session=http_sesh) From 3a21ecd5fef982380e7cbbba771425f531ffcf43 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 27 May 2024 12:33:31 -0400 Subject: [PATCH 3/5] retry --- src/dug/core/annotators/sapbert_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index e0e84837..e252e521 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -40,7 +40,7 @@ def __init__( # indicate if we want values above or below the threshold. self.score_direction_up = True if kwargs.get("score_direction", "up") == "up" else False - @retry(max_attempts=3) + @retry(stop_max_attempt_number=3) def __call__(self, text, http_session) -> List[DugIdentifier]: # Fetch identifiers classifiers: List = self.text_classification(text, http_session) From f621f17dbd4458ec23c1cbdced70dd199178808f Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 5 Jun 2024 13:28:00 -0400 Subject: [PATCH 4/5] adding cure sc parser --- src/dug/core/parsers/__init__.py | 1 + src/dug/core/parsers/dbgap_parser.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py index c1a5102e..56533fc0 100644 --- a/src/dug/core/parsers/__init__.py +++ b/src/dug/core/parsers/__init__.py @@ -44,6 +44,7 @@ def define_parsers(parser_dict: Dict[str, Parser]): parser_dict["pcgc"] = PCGCDbGaPParser() parser_dict["recover"] = RECOVERDBGaPParser() parser_dict["topmeddbgap"] = TopmedDBGaPParser() + parser_dict["curesc"] = CureSC() diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index d08e5b6e..5181b320 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -92,42 +92,57 @@ class CRDCDbGaPParser(DbGaPParser): def _get_element_type(self): return "Cancer Data Commons" + class KFDRCDbGaPParser(DbGaPParser): def _get_element_type(self): return "Kids First" + class BioLINCCDbGaPParser(DbGaPParser): def _get_element_type(self): return "BioLINCC" + class Covid19DbGaPParser(DbGaPParser): def _get_element_type(self): return "COVID19" + class DIRDbGaPParser(DbGaPParser): def _get_element_type(self): return "DIR" + class LungMAPDbGaPParser(DbGaPParser): def _get_element_type(self): return "LungMAP" + class NSRRDbGaPParser(DbGaPParser): def _get_element_type(self): return "NSRR" + class ParentDBGaPParser(DbGaPParser): def _get_element_type(self): return "Parent" + class PCGCDbGaPParser(DbGaPParser): def _get_element_type(self): return "PCGC" + class RECOVERDBGaPParser(DbGaPParser): def _get_element_type(self): return "RECOVER" + class TopmedDBGaPParser(DbGaPParser): def _get_element_type(self): - return "TOPMed" \ No newline at end of file + return "TOPMed" + + +class CureSC(DbGaPParser): + def _get_element_type(self): + return "CureSC" From 807aafa635867c6bfbe1979f341e57bb99bd5094 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 6 Jun 2024 17:43:57 -0400 Subject: [PATCH 5/5] fix tranqlizer --- src/dug/core/concept_expander.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/concept_expander.py b/src/dug/core/concept_expander.py index 4dc56494..bc8eef50 100644 --- a/src/dug/core/concept_expander.py +++ b/src/dug/core/concept_expander.py @@ -31,7 +31,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ with open(kg_filename, 'r') as stream: response = json.load(stream) else: - query = query_factory._get_var_query(identifier) + query = query_factory.get_query(identifier) logger.debug(query) response = requests.post( url=self.url,