Skip to content

Commit

Permalink
feat: replace orphapacket by orphadata API access (#84)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Jan 16, 2024
1 parent 988524d commit 07bfbe0
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 72 deletions.
7 changes: 3 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ rule all:
#
# genes
f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz",
f"work/download/genes/ordo/{DV.ordo}/ordo.csv",
"work/download/genes/alphamissense/1/AlphaMissense_gene_hg38.tsv.gz",
f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz",
"work/genes/decipher/v3/decipher_hi_prediction.tsv.gz",
Expand All @@ -96,7 +95,7 @@ rule all:
f"work/genes/gnomad/{DV.gnomad_constraints}/gnomad_constraints.tsv",
f"work/genes/hgnc/{DV.today}/hgnc_info.jsonl",
f"work/genes/omim/{DV.hpo}+{DV.today}/omim_diseases.tsv",
f"work/genes/ordo/{DV.ordo}/orpha_diseases.tsv",
f"work/genes/orphadata/{DV.orphadata}/orpha_diseases.tsv",
"work/genes/rcnv/2022/rcnv_collins_2022.tsv",
"work/genes/shet/2019/shet_weghorn_2019.tsv",
f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh37.tsv",
Expand Down Expand Up @@ -177,7 +176,7 @@ rule all:
f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
# ----- genes
f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.ordo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
# -- worker data
f"output/full/worker/genes-regions-grch37-{DV.refseq_37}+{PV.worker}/refseq_genes.bin",
f"output/full/worker/genes-regions-grch37-{DV.ensembl_37}+{PV.worker}/ensembl_genes.bin",
Expand Down Expand Up @@ -350,7 +349,7 @@ include: "rules/work/genes/mehari_data_tx.smk"
include: "rules/work/genes/ncbi.smk"
include: "rules/work/genes/omim.smk"
include: "rules/work/genes/panelapp.smk"
include: "rules/work/genes/ordo.smk"
include: "rules/work/genes/orphadata.smk"
include: "rules/work/genes/rcnv.smk"
include: "rules/work/genes/shet.smk"
include: "rules/work/genes/domino.smk"
Expand Down
4 changes: 4 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ dependencies:
- varfish-server-worker =0.10.2
# S3 uploads
- s5cmd =2.1.0
# async HTTP requests
- httpx =0.25.0
- httpcore =0.18.0
- trio
3 changes: 3 additions & 0 deletions excerpt-data/a0f9f11118d32143/download
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/a0f9f11118d32143/url.txt
Git LFS file not shown
6 changes: 3 additions & 3 deletions rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl",
ncbi="work/genes/entrez/{date}/gene_info.jsonl",
omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv",
orpha="work/genes/ordo/{v_orpha}/orpha_diseases.tsv",
orpha="work/genes/orphadata/{date}/orpha_diseases.tsv",
panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl",
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
Expand All @@ -20,11 +20,11 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
decipher_hi="work/genes/decipher/v3/decipher_hi_prediction.tsv.gz",
output:
rocksdb_identity=(
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/"
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/"
"rocksdb/IDENTITY"
),
spec_yaml=(
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/"
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/"
"spec.yaml"
),
wildcard_constraints:
Expand Down
34 changes: 0 additions & 34 deletions rules/work/genes/ordo.smk

This file was deleted.

21 changes: 21 additions & 0 deletions rules/work/genes/orphadata.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
## Rules related to downloading gene-disease associations from the Orphadata (ORDO) API.


rule genes_ordo_convert: # -- postprocess file for HGNC gene IDs
    input:
        # Gene cross-link table (maps gene_symbol to hgnc_id); consumed by
        # the download script to resolve symbols to stable HGNC IDs.
        xlink=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv",
    output:
        # Tab-separated ORPHA gene-disease association table.
        tsv="work/genes/orphadata/{version}/orpha_diseases.tsv",
        # MD5 checksum of the TSV (written as {output.tsv}.md5 below).
        tsv_md5="work/genes/orphadata/{version}/orpha_diseases.tsv.md5",
    shell:
        """
        # Work in a private temp directory and remove it on exit or error.
        export TMPDIR=$(mktemp -d)
        trap "rm -rf $TMPDIR" ERR EXIT
        # Retrieve ORDO gene-disease associations (see
        # scripts/genes-orpha-diseases.py), then sort and re-format the
        # tab-separated output with qsv before writing the checksum.
        # NOTE(review): md5sum records the full relative path in the .md5
        # file — confirm downstream consumers expect that.
        python ./scripts/genes-orpha-diseases.py {input.xlink} \
        | qsv sort -d '\t' \
        | qsv fmt -t '\t' \
        > {output.tsv}
        md5sum {output.tsv} > {output.tsv}.md5
        """
130 changes: 102 additions & 28 deletions scripts/genes-orpha-diseases.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,112 @@
#!/usr/bin/env python
"""Helper script to extract gene-disease association from ORDO CSV."""
"""Helper script to retrieve ORDO gene-disease associations from OrphaData."""

import csv
import json
import pathlib
import sys

import httpx
import trio

def main():
records = {}
#: URL for listing ORPHAcode.
URL_ORPHACODE_LIST = "https://api.orphadata.com/rd-associated-genes/orphacodes"
#: URL template for getting information on one ORPHAcode.
URL_ORPHACODE_GET = "https://api.orphadata.com/rd-cross-referencing/orphacodes/{}?lang=en"
#: URL template for getting genes for one ORPHAcode.
URL_ORPHACODE_GET_GENE = "https://api.orphadata.com/rd-associated-genes/orphacodes/{}"


async def main():
with open(sys.argv[1], "rt") as inputf:
reader = csv.DictReader(inputf, delimiter=",")
for record in reader:
records[record["gene_symbol"]] = record["hgnc_id"]

print(f"# xlink entries: {len(records)}", file=sys.stderr)

base_path = pathlib.Path(sys.argv[2])
print("\t".join(["hgnc_id", "orpha_id", "disease_name"]))
for json_path in sorted(base_path.glob("*.json")):
with json_path.open("rt") as inputf:
data = json.load(inputf)
elem_top = data["Orphapacket"]
if elem_top.get("DisorderType", {}).get("value") != "Disease":
continue # skip categories
disease_name = elem_top["Label"]
orpha_id = elem_top["PURL"].replace("http://www.orpha.net/ORDO/Orphanet_", "ORPHA:")
elem_genes = elem_top.get("Genes", [])
for elem_gene in elem_genes:
gene_symbol = elem_gene["Gene"]["Symbol"]
hgnc_id = symbol_to_hgnc.get(gene_symbol)
if hgnc_id: # skip if no HGNC ID exists, maybe withdrawn?
print("\t".join(map(str, [hgnc_id, orpha_id, disease_name])))
symbol_to_hgnc = {
row["gene_symbol"]: row["hgnc_id"] for row in csv.DictReader(inputf, delimiter="\t")
}

writer = csv.DictWriter(
sys.stdout,
fieldnames=[
"hgnc_id",
"orpha_id",
"assoc_status",
"omim_ids",
"disease_name",
"definition",
],
delimiter="\t",
)
writer.writeheader()

async with httpx.AsyncClient() as client:
print("Fetching ORPHAcode list...", file=sys.stderr)
lst = await client.get(URL_ORPHACODE_LIST)
print("...done", file=sys.stderr)
disease_ids = {disease["ORPHAcode"] for disease in lst.json()["data"]["results"]}

async def work(no: int, orpha_id: int, limiter: trio.CapacityLimiter):
async with limiter:
async with httpx.AsyncClient() as client:
try:
disease_infos = (await client.get(URL_ORPHACODE_GET.format(orpha_id))).json()
disease_genes = (
await client.get(URL_ORPHACODE_GET_GENE.format(orpha_id))
).json()
except Exception as e:
print(f"Error fetching {orpha_id}: {e}", file=sys.stderr)
raise
finally:
if no % 100 == 0:
print(
f"done fetching ORPHAcode details {no}/{len(disease_ids)}",
file=sys.stderr,
)

disease_info_results = disease_infos["data"]["results"]
disease_name = disease_info_results["Preferred term"]
summary = None
if disease_info_results.get("SummaryInformation", []):
summary = disease_info_results["SummaryInformation"][0].get("Definition", None)
omim_ids = []
for ref in disease_info_results.get("ExternalReference") or []:
if ref["Source"] == "OMIM":
omim_ids.append(f"OMIM:{ref['Reference']}")

for association in disease_genes["data"]["results"]["DisorderGeneAssociation"]:
assoc_status = association["DisorderGeneAssociationStatus"]
if not association["Gene"]["ExternalReference"]:
symbol = association["Gene"]["Symbol"]
hgnc_id = symbol_to_hgnc.get(symbol, None)
if hgnc_id:
writer.writerow(
{
"hgnc_id": hgnc_id,
"orpha_id": f"ORPHA:{orpha_id}",
"assoc_status": assoc_status,
"disease_name": disease_name,
"definition": summary,
"omim_ids": "|".join(omim_ids),
}
)
else:
for ref in association["Gene"]["ExternalReference"]:
if ref["Source"] == "HGNC":
hgnc_id = ref["Reference"]
writer.writerow(
{
"hgnc_id": f"HGNC:{hgnc_id}",
"orpha_id": f"ORPHA:{orpha_id}",
"assoc_status": assoc_status,
"disease_name": disease_name,
"definition": summary,
"omim_ids": "|".join(omim_ids),
}
)

print("Fetching ORPHAcode details...", file=sys.stderr)
limiter = trio.CapacityLimiter(10)
async with trio.open_nursery() as nursery:
for no, disease_id in enumerate(disease_ids):
nursery.start_soon(work, no, disease_id, limiter)
print("...done", file=sys.stderr)


if __name__ == "__main__":
main()
trio.run(main)
6 changes: 3 additions & 3 deletions varfish_db_downloader/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ class DataVersions:
acmg_sf: str
#: HPO
hpo: str
#: ORDO
ordo: str
#: Orphadata
orphadata: str
#: Pathogenic MMS
patho_mms: str
#: Mehari transcript data.
Expand Down Expand Up @@ -162,7 +162,7 @@ class DataVersions:
dbsnp="b151",
acmg_sf="3.1",
hpo="20230606",
ordo="4.4",
orphadata=TODAY,
patho_mms="20220730",
mehari_tx="0.4.4",
clinvar_release=CLINVAR_RELEASE,
Expand Down

0 comments on commit 07bfbe0

Please sign in to comment.