Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Jan 18, 2024
1 parent 57782a6 commit b119863
Showing 1 changed file with 90 additions and 25 deletions.
115 changes: 90 additions & 25 deletions scripts/genes-integrate-diseases.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pronto
import yaml
from loguru import logger
from pydantic import BaseModel, ConfigDict
from pydantic import BaseModel, ConfigDict, Field


class GeneXlink(BaseModel):
Expand Down Expand Up @@ -340,13 +340,26 @@ class PanelappConfidence(enum.Enum):
"""Confidence level for PanelApp."""

#: Green
GREEN = "3"
GREEN = "GREEN"
#: Amber
AMBER = "2"
AMBER = "AMBER"
#: Red
RED = "1"
RED = "RED"
#: None
NONE = "0"
NONE = "NONE"

def __lt__(self, other: "PanelappConfidence") -> bool:
"""Compare confidence levels.
GREEN is smallest for sorting ergnonomics.
"""
mapping = {
PanelappConfidence.GREEN: 0,
PanelappConfidence.AMBER: 1,
PanelappConfidence.RED: 2,
PanelappConfidence.NONE: 3,
}
return mapping[self] < mapping[other]


@enum.unique
Expand Down Expand Up @@ -394,6 +407,13 @@ class PanelappAssociation(BaseModel):
def parse_panelapp_jsonl(
path: str, xlink_by_gene_symbol: Dict[str, GeneXlink]
) -> List[PanelappAssociation]:
confidence_level_mapping = {
"3": PanelappConfidence.GREEN.value,
"2": PanelappConfidence.AMBER.value,
"1": PanelappConfidence.RED.value,
"0": PanelappConfidence.NONE.value,
}

result = []
with open(path, "rt") as inputf:
for line in inputf:
Expand All @@ -411,6 +431,7 @@ def parse_panelapp_jsonl(
f"Skipping record without HGNC ID and unmappable gene symbol: {record['gene_data']['gene_symbol']}"
)
continue
record["confidence_level"] = confidence_level_mapping[record["confidence_level"]]
assoc = PanelappAssociation(
hgnc_id=hgnc_id,
confidence_level=PanelappConfidence(record["confidence_level"]),
Expand Down Expand Up @@ -537,7 +558,10 @@ class ConfidenceLevel(enum.Enum):
LOW = "LOW"

def __lt__(self, other: "ConfidenceLevel") -> bool:
"""Compare confidence levels."""
"""Compare confidence levels.
HIGH is smallest for sorting ergnonomics.
"""
mapping = {
ConfidenceLevel.HIGH: 3,
ConfidenceLevel.MEDIUM: 2,
Expand All @@ -562,15 +586,6 @@ def __lt__(self, other: "ConfidenceLevel") -> bool:
return self.value < other.value


class PanelAppEvidence(BaseModel):
"""A panel showing this association."""

#: The panel.
panel: PanelappPanel
#: The raw strings used for adding this link.
raw_phenotypes: List[str]


class GeneDiseaseAssociationEntry(BaseModel):
"""A source gene-disease association entry."""

Expand Down Expand Up @@ -605,7 +620,7 @@ class GeneDiseaseAssociation(BaseModel):
#: List of primary disease identifiers that diseases came from.
labeled_disorders: Tuple[LabeledDisorder, ...]
#: List of disease identifiers, used for clustering.
disease_ids: Tuple[str, ...]
disease_ids: Tuple[str, ...] = Field(..., exclude=True)
#: Disease name for display, if any.
disease_name: Optional[str]
#: Disease definition from Orphanet, if available.
Expand Down Expand Up @@ -732,6 +747,28 @@ def parse_do_omim_import(path: str) -> List[DoLabel]:
return result


class GeneConditionAssociation(BaseModel):
"""Diseases and PanelApp information linked to a gene."""

#: Gene HGNC ID.
hgnc_id: str
#: The gene disease association
disease_associations: Tuple[GeneDiseaseAssociation, ...]
#: PanelApp panels with related genes.
panelapp_associations: Tuple[PanelappAssociation, ...]


class ResultContainer(BaseModel):
"""Container for results."""

#: The results.
results: Tuple[GeneConditionAssociation, ...]

def __lt__(self, other: "ResultContainer") -> bool:
"""Compare two result containers."""
return self.results < other.results


#: Some manual OMIM labels that are on MONDO exclusion lists etc.
MANUAL_OMIM_LABELS = {
"OMIM:300932": "Thyroxine-binding globulin QTL",
Expand Down Expand Up @@ -837,6 +874,8 @@ def __init__(self):
"""Initialise the integrator."""
#: Mapping from `(gene_hgnc_id, disease_id)` to `GeneDiseaseAssociationEntry`.
self.disease_assocs: Dict[GeneDiseaseKey, GeneDiseaseAssociation] = {}
#: Mapping from `hgnc_id` to list of `PanelappAssociation`s.
self.panelapp_assocs: Dict[str, List[PanelappAssociation]] = {}

def register_disease_assoc(self, assoc: GeneDiseaseAssociation):
"""Register a gene-disease association."""
Expand Down Expand Up @@ -864,17 +903,43 @@ def run(self, pickle_path: Optional[str] = None):
self.handle_orpha()
self.handle_mim2gene_medgen()
self.handle_panelapp()

conditions_by_hgnc = {
hgnc_id: GeneConditionAssociation(
hgnc_id=hgnc_id, disease_associations=[], panelapp_associations=[]
)
for hgnc_id in sorted(
set(
chain(
(k.hgnc_id for k in self.disease_assocs.keys()), self.panelapp_assocs.keys()
)
)
)
}
for hgnc_id, assocs in self.panelapp_assocs.items():
conditions_by_hgnc[hgnc_id] = conditions_by_hgnc[hgnc_id].model_copy(
update={
"panelapp_associations": tuple(sorted(assocs, key=lambda a: a.confidence_level))
}
)
for key, assoc in self.disease_assocs.items():
disease_assoc = conditions_by_hgnc[key.hgnc_id].disease_associations
conditions_by_hgnc[key.hgnc_id] = conditions_by_hgnc[key.hgnc_id].model_copy(
update={
"disease_associations": tuple(
sorted(chain(disease_assoc, [assoc]), key=lambda a: a.confidence)
)
}
)
result = ResultContainer(results=tuple(conditions_by_hgnc.values()))
yaml.dump(
[
assoc.model_dump(mode="json")
for assoc in sorted(set(self.disease_assocs.values()), key=lambda x: x.hgnc_id)
],
result.model_dump(mode="json"),
sys.stdout,
default_flow_style=False,
# default_flow_style=False,
)
logger.info("All done. Have a nice day.")

def load_data(self, pickle_path: Optional[str] = None):
def load_data(self, pickle_path: Optional[str] = None): # noqa: C901
if os.path.exists(pickle_path):
logger.info("unpickling...")
with gzip.open(pickle_path, "rb") as inputf:
Expand Down Expand Up @@ -1132,7 +1197,7 @@ def handle_orpha(self):
)
)

def handle_panelapp(self):
def handle_panelapp(self): # noqa: C901
"""Process the PanelApp data."""
RE_OMIM = r"(?:O?MIM:)?(\d\d\d\d\d\d)"
RE_MONDO = r"MONDO:(\d+)"
Expand All @@ -1143,6 +1208,8 @@ def handle_panelapp(self):
disease.database_id: disease for disease in self.orpha_disorders
}
for assoc in self.panelapp_associations:
self.panelapp_assocs.setdefault(assoc.hgnc_id, []).append(assoc)

if assoc.confidence_level == PanelappConfidence.NONE.value:
continue
if assoc.phenotypes:
Expand Down Expand Up @@ -1191,8 +1258,6 @@ def handle_panelapp(self):
.replace("}", "")
)

# if "ORPHA:189427-definition" in orpha_ids:
# import pdb; pdb.set_trace()
labeled_omim_ids = [
LabeledDisorder(
term_id=omim_id, title=self.omim_titles.get(omim_id, stripped_phenotype)
Expand Down

0 comments on commit b119863

Please sign in to comment.