Skip to content

Commit

Permalink
Merge pull request #356 from helxplatform/dug-program-names-and-ner
Browse files Browse the repository at this point in the history
 new parsers and fun things annotation change, config additions for t…
  • Loading branch information
YaphetKG authored Jun 28, 2024
2 parents 9dbc791 + 0bac2c0 commit 70ae8f7
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 40 deletions.
2 changes: 1 addition & 1 deletion src/dug/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class Config:
},
"sapbert": {
"classification_url": "https://med-nemo.apps.renci.org/annotate/",
"annotator_url": "https://babel-sapbert.apps.renci.org/annotate/",
"annotator_url": "https://sap-qdrant.apps.renci.org/annotate/",
},
}
)
Expand Down
2 changes: 2 additions & 0 deletions src/dug/core/annotators/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from dug import utils as utils
from requests import Session
import bmt
from retrying import retry

logger = logging.getLogger("dug")

Expand Down Expand Up @@ -198,6 +199,7 @@ def __call__(self, curie: str, http_session):
result = self.handle_response(curie, response)
return result

@retry(stop_max_attempt_number=3)
def make_request(self, curie: str, http_session: Session):
# Get response from namelookup reverse lookup op
# example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
Expand Down
53 changes: 18 additions & 35 deletions src/dug/core/annotators/sapbert_annotator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import logging
from typing import List
from requests import Session
import json

from retrying import retry
from dug.core.annotators._base import DugIdentifier, Input
from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer

Expand All @@ -26,6 +25,7 @@ def __init__(
):
self.classificationUrl = kwargs.get('classification_url')
self.annotatorUrl = kwargs.get('annotator_url')

if not self.classificationUrl:
raise TypeError('Classification url needs to be defined for sapbert annotator')
if not self.annotatorUrl:
Expand All @@ -35,7 +35,12 @@ def __init__(
self.ontology_greenlist = ontology_greenlist
self.norm_fails_file = "norm_fails.txt"
self.anno_fails_file = "anno_fails.txt"
# Score cutoff used to decide which annotation results are kept (defaults to 0.8).
self.score_threshold = kwargs.get("score_threshold", 0.8)
# True ("up", the default) keeps results scoring at or above the threshold; False keeps results at or below it.
self.score_direction_up = True if kwargs.get("score_direction", "up") == "up" else False

@retry(stop_max_attempt_number=3)
def __call__(self, text, http_session) -> List[DugIdentifier]:
# Fetch identifiers
classifiers: List = self.text_classification(text, http_session)
Expand Down Expand Up @@ -132,7 +137,7 @@ def handle_classification_response(self, response: dict) -> List:
text = denotation.get("text", None)
bl_type = denotation.get("obj", None)
classifiers.append(
{"text": text, "bl_type": bl_type.replace("biolink:", "")}
{"text": text, "bl_type": bl_type}
)
return classifiers

Expand Down Expand Up @@ -184,7 +189,7 @@ def make_annotation_request(self, term_dict: Input, http_session: Session):
payload = {
"text": term_dict["text"],
"model_name": "sapbert",
"count": 1000,
"count": 10,
"args": {"bl_type": term_dict["bl_type"]},
}
# This could be moved to a config file
Expand Down Expand Up @@ -213,36 +218,14 @@ def handle_annotation_response(self, value, response: dict) -> List[DugIdentifie
continue

biolink_type = identifier.get('category')
score = identifier.get("score", None)
score = identifier.get("score", 0)
label = identifier.get("name")
identifiers.append(
DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
)
if score >= self.score_threshold and self.score_direction_up:
identifiers.append(
DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
)
elif score <= self.score_threshold and not self.score_direction_up:
identifiers.append(
DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
)
return identifiers

## Testing Purposes
# if __name__ == "__main__":
# from dug.config import Config
# import json
# import redis
# from requests_cache import CachedSession
# from dug.core.annotators._base import DefaultNormalizer, DefaultSynonymFinder

# config = Config.from_env()
# annotator = AnnotateSapbert(
# normalizer=DefaultNormalizer(**config.normalizer),
# synonym_finder=DefaultSynonymFinder(**config.synonym_service),
# )

# redis_config = {
# "host": "localhost",
# "port": config.redis_port,
# "password": config.redis_password,
# }

# http_sesh = CachedSession(
# cache_name="annotator",
# backend="redis",
# connection=redis.StrictRedis(**redis_config),
# )
# annotator(text="Have you ever had a heart attack?", http_session=http_sesh)
2 changes: 1 addition & 1 deletion src/dug/core/concept_expander.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_
with open(kg_filename, 'r') as stream:
response = json.load(stream)
else:
query = query_factory._get_var_query(identifier)
query = query_factory.get_query(identifier)
logger.debug(query)
response = requests.post(
url=self.url,
Expand Down
13 changes: 12 additions & 1 deletion src/dug/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pluggy

from ._base import DugElement, DugConcept, Indexable, Parser, FileParser
from .dbgap_parser import DbGaPParser, AnvilDbGaPParser, KFDRCDbGaPParser, CRDCDbGaPParser
from .dbgap_parser import *
from .nida_parser import NIDAParser
from .scicrunch_parser import SciCrunchParser
from .topmed_tag_parser import TOPMedTagParser
Expand Down Expand Up @@ -36,8 +36,19 @@ def define_parsers(parser_dict: Dict[str, Parser]):
parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies")
parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs")
parser_dict["ctn"] = CTNParser()
parser_dict["biolincc"] = BioLINCCDbGaPParser()
parser_dict["covid19"] = Covid19DbGaPParser()
parser_dict["dir"] = DIRDbGaPParser()
parser_dict["lungmap"] = LungMAPDbGaPParser()
parser_dict["nsrr"] = NSRRDbGaPParser()
parser_dict["parent"] = ParentDBGaPParser()
parser_dict["pcgc"] = PCGCDbGaPParser()
parser_dict["recover"] = RECOVERDBGaPParser()
parser_dict["topmeddbgap"] = TopmedDBGaPParser()
parser_dict["curesc"] = CureSC()
parser_dict["radx"] = RADxParser()





Expand Down
50 changes: 50 additions & 0 deletions src/dug/core/parsers/dbgap_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,57 @@ class CRDCDbGaPParser(DbGaPParser):
def _get_element_type(self):
return "Cancer Data Commons"


class KFDRCDbGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "Kids First" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "Kids First"."""
        element_type = "Kids First"
        return element_type


class BioLINCCDbGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "BioLINCC" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "BioLINCC"."""
        element_type = "BioLINCC"
        return element_type


class Covid19DbGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "COVID19" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "COVID19"."""
        element_type = "COVID19"
        return element_type


class DIRDbGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "DIR" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "DIR"."""
        element_type = "DIR"
        return element_type


class LungMAPDbGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "LungMAP" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "LungMAP"."""
        element_type = "LungMAP"
        return element_type


class NSRRDbGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "NSRR" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "NSRR"."""
        element_type = "NSRR"
        return element_type


class ParentDBGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "Parent" as its element type."""

    # NOTE(review): the class name spells "DBGaP" where the base class spells
    # "DbGaP"; the name is kept as-is because the parser registry refers to it.
    def _get_element_type(self):
        """Return the element-type label for this parser: "Parent"."""
        element_type = "Parent"
        return element_type


class PCGCDbGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "PCGC" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "PCGC"."""
        element_type = "PCGC"
        return element_type


class RECOVERDBGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "RECOVER" as its element type."""

    # NOTE(review): the class name spells "DBGaP" where the base class spells
    # "DbGaP"; the name is kept as-is because the parser registry refers to it.
    def _get_element_type(self):
        """Return the element-type label for this parser: "RECOVER"."""
        element_type = "RECOVER"
        return element_type


class TopmedDBGaPParser(DbGaPParser):
    """DbGaPParser subclass that reports "TOPMed" as its element type."""

    # NOTE(review): the class name spells "DBGaP" where the base class spells
    # "DbGaP"; the name is kept as-is because the parser registry refers to it.
    def _get_element_type(self):
        """Return the element-type label for this parser: "TOPMed"."""
        element_type = "TOPMed"
        return element_type


class CureSC(DbGaPParser):
    """DbGaPParser subclass that reports "CureSC" as its element type."""

    def _get_element_type(self):
        """Return the element-type label for this parser: "CureSC"."""
        element_type = "CureSC"
        return element_type
2 changes: 1 addition & 1 deletion src/dug/core/parsers/topmed_csv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem = DugElement(elem_id=row['variable_full_accession'],
name=row['variable_name'],
desc=row['variable_desc'],
elem_type="dbGaP",
elem_type="TOPMed",
collection_id=row['study_full_accession'],
collection_name=row['study_name'])

Expand Down
2 changes: 1 addition & 1 deletion src/dug/core/parsers/topmed_tag_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem_id=row['variable_full_accession'],
name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'],
desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'],
elem_type="dbGaP",
elem_type="TOPMed",
collection_id=row['study_full_accession'],
collection_name=row['study_name']
)
Expand Down

0 comments on commit 70ae8f7

Please sign in to comment.