Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new parsers and fun things annotation change, config additions for t… #356

Merged
merged 6 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/dug/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class Config:
},
"sapbert": {
"classification_url": "https://med-nemo.apps.renci.org/annotate/",
"annotator_url": "https://babel-sapbert.apps.renci.org/annotate/",
"annotator_url": "https://sap-qdrant.apps.renci.org/annotate/",
},
}
)
Expand Down
2 changes: 2 additions & 0 deletions src/dug/core/annotators/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from dug import utils as utils
from requests import Session
import bmt
from retrying import retry

logger = logging.getLogger("dug")

Expand Down Expand Up @@ -198,6 +199,7 @@ def __call__(self, curie: str, http_session):
result = self.handle_response(curie, response)
return result

@retry(stop_max_attempt_number=3)
def make_request(self, curie: str, http_session: Session):
# Get response from namelookup reverse lookup op
# example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
Expand Down
53 changes: 18 additions & 35 deletions src/dug/core/annotators/sapbert_annotator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import logging
from typing import List
from requests import Session
import json

from retrying import retry
from dug.core.annotators._base import DugIdentifier, Input
from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer

Expand All @@ -26,6 +25,7 @@ def __init__(
):
self.classificationUrl = kwargs.get('classification_url')
self.annotatorUrl = kwargs.get('annotator_url')

if not self.classificationUrl:
raise TypeError('Classification url needs to be defined for sapbert annotator')
if not self.annotatorUrl:
Expand All @@ -35,7 +35,12 @@ def __init__(
self.ontology_greenlist = ontology_greenlist
self.norm_fails_file = "norm_fails.txt"
self.anno_fails_file = "anno_fails.txt"
# threshold marking cutoff point
self.score_threshold = kwargs.get("score_threshold", 0.8)
# indicate if we want values above or below the threshold.
self.score_direction_up = True if kwargs.get("score_direction", "up") == "up" else False

@retry(stop_max_attempt_number=3)
def __call__(self, text, http_session) -> List[DugIdentifier]:
# Fetch identifiers
classifiers: List = self.text_classification(text, http_session)
Expand Down Expand Up @@ -132,7 +137,7 @@ def handle_classification_response(self, response: dict) -> List:
text = denotation.get("text", None)
bl_type = denotation.get("obj", None)
classifiers.append(
{"text": text, "bl_type": bl_type.replace("biolink:", "")}
{"text": text, "bl_type": bl_type}
)
return classifiers

Expand Down Expand Up @@ -184,7 +189,7 @@ def make_annotation_request(self, term_dict: Input, http_session: Session):
payload = {
"text": term_dict["text"],
"model_name": "sapbert",
"count": 1000,
"count": 10,
"args": {"bl_type": term_dict["bl_type"]},
}
# This could be moved to a config file
Expand Down Expand Up @@ -213,36 +218,14 @@ def handle_annotation_response(self, value, response: dict) -> List[DugIdentifie
continue

biolink_type = identifier.get('category')
score = identifier.get("score", None)
score = identifier.get("score", 0)
label = identifier.get("name")
identifiers.append(
DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
)
if score >= self.score_threshold and self.score_direction_up:
identifiers.append(
DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
)
elif score <= self.score_threshold and not self.score_direction_up:
identifiers.append(
DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text)
)
return identifiers

## Testing Purposes
# if __name__ == "__main__":
# from dug.config import Config
# import json
# import redis
# from requests_cache import CachedSession
# from dug.core.annotators._base import DefaultNormalizer, DefaultSynonymFinder

# config = Config.from_env()
# annotator = AnnotateSapbert(
# normalizer=DefaultNormalizer(**config.normalizer),
# synonym_finder=DefaultSynonymFinder(**config.synonym_service),
# )

# redis_config = {
# "host": "localhost",
# "port": config.redis_port,
# "password": config.redis_password,
# }

# http_sesh = CachedSession(
# cache_name="annotator",
# backend="redis",
# connection=redis.StrictRedis(**redis_config),
# )
# annotator(text="Have you ever had a heart attack?", http_session=http_sesh)
2 changes: 1 addition & 1 deletion src/dug/core/concept_expander.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_
with open(kg_filename, 'r') as stream:
response = json.load(stream)
else:
query = query_factory._get_var_query(identifier)
query = query_factory.get_query(identifier)
logger.debug(query)
response = requests.post(
url=self.url,
Expand Down
13 changes: 12 additions & 1 deletion src/dug/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pluggy

from ._base import DugElement, DugConcept, Indexable, Parser, FileParser
from .dbgap_parser import DbGaPParser, AnvilDbGaPParser, KFDRCDbGaPParser, CRDCDbGaPParser
from .dbgap_parser import *
from .nida_parser import NIDAParser
from .scicrunch_parser import SciCrunchParser
from .topmed_tag_parser import TOPMedTagParser
Expand Down Expand Up @@ -36,8 +36,19 @@ def define_parsers(parser_dict: Dict[str, Parser]):
parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies")
parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs")
parser_dict["ctn"] = CTNParser()
parser_dict["biolincc"] = BioLINCCDbGaPParser()
parser_dict["covid19"] = Covid19DbGaPParser()
parser_dict["dir"] = DIRDbGaPParser()
parser_dict["lungmap"] = LungMAPDbGaPParser()
parser_dict["nsrr"] = NSRRDbGaPParser()
parser_dict["parent"] = ParentDBGaPParser()
parser_dict["pcgc"] = PCGCDbGaPParser()
parser_dict["recover"] = RECOVERDBGaPParser()
parser_dict["topmeddbgap"] = TopmedDBGaPParser()
parser_dict["curesc"] = CureSC()
parser_dict["radx"] = RADxParser()





Expand Down
50 changes: 50 additions & 0 deletions src/dug/core/parsers/dbgap_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,57 @@ class CRDCDbGaPParser(DbGaPParser):
def _get_element_type(self):
return "Cancer Data Commons"


class KFDRCDbGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "Kids First" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "Kids First"


class BioLINCCDbGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "BioLINCC" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "BioLINCC"


class Covid19DbGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "COVID19" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "COVID19"


class DIRDbGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "DIR" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "DIR"


class LungMAPDbGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "LungMAP" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "LungMAP"


class NSRRDbGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "NSRR" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "NSRR"


class ParentDBGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "Parent" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "Parent"


class PCGCDbGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "PCGC" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "PCGC"


class RECOVERDBGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "RECOVER" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "RECOVER"


class TopmedDBGaPParser(DbGaPParser):
    """`DbGaPParser` subclass that reports "TOPMed" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "TOPMed"


class CureSC(DbGaPParser):
    """`DbGaPParser` subclass that reports "CureSC" as its element type."""

    def _get_element_type(self):
        """Return the fixed element-type label for this parser."""
        return "CureSC"
2 changes: 1 addition & 1 deletion src/dug/core/parsers/topmed_csv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem = DugElement(elem_id=row['variable_full_accession'],
name=row['variable_name'],
desc=row['variable_desc'],
elem_type="dbGaP",
elem_type="TOPMed",
collection_id=row['study_full_accession'],
collection_name=row['study_name'])

Expand Down
2 changes: 1 addition & 1 deletion src/dug/core/parsers/topmed_tag_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem_id=row['variable_full_accession'],
name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'],
desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'],
elem_type="dbGaP",
elem_type="TOPMed",
collection_id=row['study_full_accession'],
collection_name=row['study_name']
)
Expand Down
Loading