From 07bfbe0b1e3c611274db5c528d846f8ccaf0ff89 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 16 Jan 2024 12:38:33 +0100 Subject: [PATCH] feat: replace orphapacket by orphadata API access (#84) --- Snakefile | 7 +- environment.yml | 4 + excerpt-data/a0f9f11118d32143/download | 3 + excerpt-data/a0f9f11118d32143/url.txt | 3 + rules/output/annonars/genes.smk | 6 +- rules/work/genes/ordo.smk | 34 ------- rules/work/genes/orphadata.smk | 21 ++++ scripts/genes-orpha-diseases.py | 130 +++++++++++++++++++------ varfish_db_downloader/versions.py | 6 +- 9 files changed, 142 insertions(+), 72 deletions(-) create mode 100644 excerpt-data/a0f9f11118d32143/download create mode 100644 excerpt-data/a0f9f11118d32143/url.txt delete mode 100644 rules/work/genes/ordo.smk create mode 100644 rules/work/genes/orphadata.smk diff --git a/Snakefile b/Snakefile index c33391d..dbe61d6 100644 --- a/Snakefile +++ b/Snakefile @@ -86,7 +86,6 @@ rule all: # # genes f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz", - f"work/download/genes/ordo/{DV.ordo}/ordo.csv", "work/download/genes/alphamissense/1/AlphaMissense_gene_hg38.tsv.gz", f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz", "work/genes/decipher/v3/decipher_hi_prediction.tsv.gz", @@ -96,7 +95,7 @@ rule all: f"work/genes/gnomad/{DV.gnomad_constraints}/gnomad_constraints.tsv", f"work/genes/hgnc/{DV.today}/hgnc_info.jsonl", f"work/genes/omim/{DV.hpo}+{DV.today}/omim_diseases.tsv", - f"work/genes/ordo/{DV.ordo}/orpha_diseases.tsv", + f"work/genes/orphadata/{DV.orphadata}/orpha_diseases.tsv", "work/genes/rcnv/2022/rcnv_collins_2022.tsv", "work/genes/shet/2019/shet_weghorn_2019.tsv", f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh37.tsv", @@ -177,7 +176,7 @@ rule all: f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY", # ----- genes - f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.ordo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", + f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", # -- worker data f"output/full/worker/genes-regions-grch37-{DV.refseq_37}+{PV.worker}/refseq_genes.bin", f"output/full/worker/genes-regions-grch37-{DV.ensembl_37}+{PV.worker}/ensembl_genes.bin", @@ -350,7 +349,7 @@ include: "rules/work/genes/mehari_data_tx.smk" include: "rules/work/genes/ncbi.smk" include: "rules/work/genes/omim.smk" include: "rules/work/genes/panelapp.smk" -include: "rules/work/genes/ordo.smk" +include: "rules/work/genes/orphadata.smk" include: "rules/work/genes/rcnv.smk" include: "rules/work/genes/shet.smk" include: "rules/work/genes/domino.smk" diff --git a/environment.yml b/environment.yml index 1f42695..8fb67af 100644 --- a/environment.yml +++ b/environment.yml @@ -47,3 +47,7 @@ dependencies: - varfish-server-worker =0.10.2 # S3 uploads - s5cmd =2.1.0 + # async HTTP requests + - httpx =0.25.0 + - httpcore =0.18.0 + - trio diff --git a/excerpt-data/a0f9f11118d32143/download b/excerpt-data/a0f9f11118d32143/download new file mode 100644 index 0000000..759232c --- /dev/null +++ b/excerpt-data/a0f9f11118d32143/download @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4f7b8049f2238d04b0453d2422d7499fd7196162027b7a3b4f2f91ca9b16a2 +size 2509937 diff --git a/excerpt-data/a0f9f11118d32143/url.txt b/excerpt-data/a0f9f11118d32143/url.txt new file mode 100644 index 0000000..b8b04dd --- /dev/null +++ b/excerpt-data/a0f9f11118d32143/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa28fe18f2378e43812405874df6a303db980507aea705278cbc047d03b3d38c +size 118 diff --git a/rules/output/annonars/genes.smk b/rules/output/annonars/genes.smk index 8247030..06e460a 100644 --- a/rules/output/annonars/genes.smk +++ b/rules/output/annonars/genes.smk @@ -11,7 +11,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl", ncbi="work/genes/entrez/{date}/gene_info.jsonl", omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv", - orpha="work/genes/ordo/{v_orpha}/orpha_diseases.tsv", + orpha="work/genes/orphadata/{date}/orpha_diseases.tsv", panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl", rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv", shet="work/genes/shet/2019/shet_weghorn_2019.tsv", @@ -20,11 +20,11 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file decipher_hi="work/genes/decipher/v3/decipher_hi_prediction.tsv.gz", output: rocksdb_identity=( - "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/" + "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/" "rocksdb/IDENTITY" ), spec_yaml=( - "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/" + "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/" "spec.yaml" ), wildcard_constraints: diff --git a/rules/work/genes/ordo.smk b/rules/work/genes/ordo.smk deleted file mode 100644 index 7b14cc8..0000000 --- a/rules/work/genes/ordo.smk +++ /dev/null @@ -1,34 +0,0 @@ -## Rules related to ORDO download - - -rule genes_ordo_download: # -- download ORDO CSV file - output: - csv=f"work/download/genes/ordo/{DV.ordo}/ordo.csv", - csv_md5=f"work/download/genes/ordo/{DV.ordo}/ordo.csv.md5", - shell: - r""" - wget --no-check-certificate \ - -O {output.csv} \ - 'https://data.bioontology.org/ontologies/ORDO/submissions/28/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv' - - md5sum {output.csv} > {output.csv_md5} - """ - -rule genes_ordo_convert: # -- postprocess file for HGNC gene IDs - input: - csv="work/download/genes/ordo/{version}/ordo.csv", - output: - tsv="work/genes/ordo/{version}/orpha_diseases.tsv", - tsv_md5="work/genes/ordo/{version}/orpha_diseases.tsv.md5", - shell: - """ - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" ERR EXIT - - python ./scripts/genes-orpha-diseases.py {input.csv} \ - | qsv sort -d '\t' \ - | qsv fmt -t '\t' \ - > {output.tsv} - - md5sum {output.tsv} > {output.tsv}.md5 - """ diff --git a/rules/work/genes/orphadata.smk b/rules/work/genes/orphadata.smk new file mode 100644 index 0000000..27af3ec --- /dev/null +++ b/rules/work/genes/orphadata.smk @@ -0,0 +1,21 @@ +## Rules related to ORDO download + + +rule genes_ordo_convert: # -- postprocess file for HGNC gene IDs + input: + xlink=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv", + output: + tsv="work/genes/orphadata/{version}/orpha_diseases.tsv", + tsv_md5="work/genes/orphadata/{version}/orpha_diseases.tsv.md5", + shell: + """ + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" ERR EXIT + + python ./scripts/genes-orpha-diseases.py {input.xlink} \ + | qsv sort -d '\t' \ + | qsv fmt -t '\t' \ + > {output.tsv} + + md5sum {output.tsv} > {output.tsv}.md5 + """ diff --git a/scripts/genes-orpha-diseases.py b/scripts/genes-orpha-diseases.py index 151d520..394beaa 100755 --- a/scripts/genes-orpha-diseases.py +++ b/scripts/genes-orpha-diseases.py @@ -1,38 +1,112 @@ #!/usr/bin/env python -"""Helper script to extract gene-disease association from ORDO CSV.""" +"""Helper script to retrieve ORDO gene-disease associations from OrphaData.""" import csv -import json -import pathlib import sys +import httpx +import trio -def main(): - records = {} +#: URL for listing ORPHAcode. +URL_ORPHACODE_LIST = "https://api.orphadata.com/rd-associated-genes/orphacodes" +#: URL template for getting information on one ORPHAcode. +URL_ORPHACODE_GET = "https://api.orphadata.com/rd-cross-referencing/orphacodes/{}?lang=en" +#: URL template for getting enes on one ORPHAcode. +URL_ORPHACODE_GET_GENE = "https://api.orphadata.com/rd-associated-genes/orphacodes/{}" + + +async def main(): with open(sys.argv[1], "rt") as inputf: - reader = csv.DictReader(inputf, delimiter=",") - for record in reader: - records[record["gene_symbol"]] = record["hgnc_id"] - - print(f"# xlink entries: {len(records)}", file=sys.stderr) - - base_path = pathlib.Path(sys.argv[2]) - print("\t".join(["hgnc_id", "orpha_id", "disease_name"])) - for json_path in sorted(base_path.glob("*.json")): - with json_path.open("rt") as inputf: - data = json.load(inputf) - elem_top = data["Orphapacket"] - if elem_top.get("DisorderType", {}).get("value") != "Disease": - continue # skip categories - disease_name = elem_top["Label"] - orpha_id = elem_top["PURL"].replace("http://www.orpha.net/ORDO/Orphanet_", "ORPHA:") - elem_genes = elem_top.get("Genes", []) - for elem_gene in elem_genes: - gene_symbol = elem_gene["Gene"]["Symbol"] - hgnc_id = symbol_to_hgnc.get(gene_symbol) - if hgnc_id: # skip if no HGNC ID exists, maybe withdrawn? - print("\t".join(map(str, [hgnc_id, orpha_id, disease_name]))) + symbol_to_hgnc = { + row["gene_symbol"]: row["hgnc_id"] for row in csv.DictReader(inputf, delimiter="\t") + } + + writer = csv.DictWriter( + sys.stdout, + fieldnames=[ + "hgnc_id", + "orpha_id", + "assoc_status", + "omim_ids", + "disease_name", + "definition", + ], + delimiter="\t", + ) + writer.writeheader() + + async with httpx.AsyncClient() as client: + print("Fetching ORPHAcode list...", file=sys.stderr) + lst = await client.get(URL_ORPHACODE_LIST) + print("...done", file=sys.stderr) + disease_ids = {disease["ORPHAcode"] for disease in lst.json()["data"]["results"]} + + async def work(no: int, orpha_id: int, limiter: trio.CapacityLimiter): + async with limiter: + async with httpx.AsyncClient() as client: + try: + disease_infos = (await client.get(URL_ORPHACODE_GET.format(orpha_id))).json() + disease_genes = ( + await client.get(URL_ORPHACODE_GET_GENE.format(orpha_id)) + ).json() + except Exception as e: + print(f"Error fetching {orpha_id}: {e}", file=sys.stderr) + raise + finally: + if no % 100 == 0: + print( + f"done fetching ORPHAcode details {no}/{len(disease_ids)}", + file=sys.stderr, + ) + + disease_info_results = disease_infos["data"]["results"] + disease_name = disease_info_results["Preferred term"] + summary = None + if disease_info_results.get("SummaryInformation", []): + summary = disease_info_results["SummaryInformation"][0].get("Definition", None) + omim_ids = [] + for ref in disease_info_results.get("ExternalReference") or []: + if ref["Source"] == "OMIM": + omim_ids.append(f"OMIM:{ref['Reference']}") + + for association in disease_genes["data"]["results"]["DisorderGeneAssociation"]: + assoc_status = association["DisorderGeneAssociationStatus"] + if not association["Gene"]["ExternalReference"]: + symbol = association["Gene"]["Symbol"] + hgnc_id = symbol_to_hgnc.get(symbol, None) + if hgnc_id: + writer.writerow( + { + "hgnc_id": hgnc_id, + "orpha_id": f"ORPHA:{orpha_id}", + "assoc_status": assoc_status, + "disease_name": disease_name, + "definition": summary, + "omim_ids": "|".join(omim_ids), + } + ) + else: + for ref in association["Gene"]["ExternalReference"]: + if ref["Source"] == "HGNC": + hgnc_id = ref["Reference"] + writer.writerow( + { + "hgnc_id": f"HGNC:{hgnc_id}", + "orpha_id": f"ORPHA:{orpha_id}", + "assoc_status": assoc_status, + "disease_name": disease_name, + "definition": summary, + "omim_ids": "|".join(omim_ids), + } + ) + + print("Fetching ORPHAcode details...", file=sys.stderr) + limiter = trio.CapacityLimiter(10) + async with trio.open_nursery() as nursery: + for no, disease_id in enumerate(disease_ids): + nursery.start_soon(work, no, disease_id, limiter) + print("...done", file=sys.stderr) if __name__ == "__main__": - main() + trio.run(main) diff --git a/varfish_db_downloader/versions.py b/varfish_db_downloader/versions.py index fc0d7f5..2615c83 100644 --- a/varfish_db_downloader/versions.py +++ b/varfish_db_downloader/versions.py @@ -102,8 +102,8 @@ class DataVersions: acmg_sf: str #: HPO hpo: str - #: ORDO - ordo: str + #: Orphadata + orphadata: str #: Pathogenic MMS patho_mms: str #: Mehari transcript data. @@ -162,7 +162,7 @@ class DataVersions: dbsnp="b151", acmg_sf="3.1", hpo="20230606", - ordo="4.4", + orphadata=TODAY, patho_mms="20220730", mehari_tx="0.4.4", clinvar_release=CLINVAR_RELEASE,