-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: replace orphapacket by orphadata API access (#84)
- Loading branch information
Showing
9 changed files
with
142 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Git LFS file not shown
Git LFS file not shown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
## Rules related to ORDO download | ||
|
||
|
||
# Build the ORPHA gene-disease table: run the OrphaData retrieval script on the
# HGNC xlink table, sort and normalise the TSV with qsv, and write an MD5 file
# next to the result.
rule genes_ordo_convert:  # -- postprocess file for HGNC gene IDs
    input:
        xlink=f"output/full/mehari/genes-xlink-{DV.today}/genes-xlink.tsv",
    output:
        tsv="work/genes/orphadata/{version}/orpha_diseases.tsv",
        tsv_md5="work/genes/orphadata/{version}/orpha_diseases.tsv.md5",
    shell:
        """
        # Private scratch directory, removed again on error and on exit.
        # Single quotes defer expansion so the path is quoted when the trap runs.
        export TMPDIR=$(mktemp -d)
        trap 'rm -rf "$TMPDIR"' ERR EXIT

        python ./scripts/genes-orpha-diseases.py {input.xlink} \
        | qsv sort -d '\t' \
        | qsv fmt -t '\t' \
        > {output.tsv}

        # Write to the declared output so the checksum path cannot drift from
        # what Snakemake expects.
        md5sum {output.tsv} > {output.tsv_md5}
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,112 @@ | ||
#!/usr/bin/env python | ||
"""Helper script to extract gene-disease association from ORDO CSV.""" | ||
"""Helper script to retrieve ORDO gene-disease associations from OrphaData.""" | ||
|
||
import csv | ||
import json | ||
import pathlib | ||
import sys | ||
|
||
import httpx | ||
import trio | ||
|
||
def main(): | ||
records = {} | ||
#: URL for listing ORPHAcode. | ||
URL_ORPHACODE_LIST = "https://api.orphadata.com/rd-associated-genes/orphacodes" | ||
#: URL template for getting information on one ORPHAcode. | ||
URL_ORPHACODE_GET = "https://api.orphadata.com/rd-cross-referencing/orphacodes/{}?lang=en" | ||
#: URL template for getting genes on one ORPHAcode. | ||
URL_ORPHACODE_GET_GENE = "https://api.orphadata.com/rd-associated-genes/orphacodes/{}" | ||
|
||
|
||
def _disease_columns(orpha_id, results):
    """Return the per-disease output columns shared by all rows of one ORPHAcode.

    ``results`` is the ``data.results`` object of the cross-referencing
    response.  The definition is taken from the first ``SummaryInformation``
    entry (``None`` if there is none); OMIM references are joined with ``|``.
    """
    summaries = results.get("SummaryInformation") or []
    definition = summaries[0].get("Definition") if summaries else None
    omim_ids = [
        f"OMIM:{ref['Reference']}"
        for ref in results.get("ExternalReference") or []
        if ref["Source"] == "OMIM"
    ]
    return {
        "orpha_id": f"ORPHA:{orpha_id}",
        "omim_ids": "|".join(omim_ids),
        "disease_name": results["Preferred term"],
        "definition": definition,
    }


def _association_hgnc_ids(association, symbol_to_hgnc):
    """Return the HGNC IDs to emit for one disorder-gene association.

    If the gene carries external references, every reference whose source is
    ``HGNC`` yields ``HGNC:<reference>``.  Otherwise the gene symbol is looked
    up in ``symbol_to_hgnc`` and the mapped value is used verbatim; genes
    without a mapping yield nothing.
    """
    gene = association["Gene"]
    external_refs = gene.get("ExternalReference")
    if not external_refs:
        hgnc_id = symbol_to_hgnc.get(gene["Symbol"])
        return [hgnc_id] if hgnc_id else []
    return [f"HGNC:{ref['Reference']}" for ref in external_refs if ref["Source"] == "HGNC"]


async def main():
    """Fetch gene-disease associations from OrphaData and write them as TSV.

    ``sys.argv[1]`` is the path to a tab-separated xlink table with the columns
    ``gene_symbol`` and ``hgnc_id``.  One row per (gene, disease) association is
    written to stdout; progress and errors go to stderr.  Any failed request
    (including a non-2xx status) is logged and re-raised, aborting the run.
    """
    with open(sys.argv[1], "rt") as inputf:
        symbol_to_hgnc = {
            row["gene_symbol"]: row["hgnc_id"] for row in csv.DictReader(inputf, delimiter="\t")
        }

    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=[
            "hgnc_id",
            "orpha_id",
            "assoc_status",
            "omim_ids",
            "disease_name",
            "definition",
        ],
        delimiter="\t",
    )
    writer.writeheader()

    # One client for the whole run so that connections are pooled and reused
    # instead of being re-established for every ORPHAcode.
    async with httpx.AsyncClient() as client:
        print("Fetching ORPHAcode list...", file=sys.stderr)
        lst = await client.get(URL_ORPHACODE_LIST)
        lst.raise_for_status()
        print("...done", file=sys.stderr)
        disease_ids = {disease["ORPHAcode"] for disease in lst.json()["data"]["results"]}

        async def work(no: int, orpha_id: int, limiter: trio.CapacityLimiter):
            """Fetch details and genes of one ORPHAcode and write its rows."""
            async with limiter:  # bounds the number of in-flight requests
                try:
                    infos_response = await client.get(URL_ORPHACODE_GET.format(orpha_id))
                    infos_response.raise_for_status()
                    genes_response = await client.get(URL_ORPHACODE_GET_GENE.format(orpha_id))
                    genes_response.raise_for_status()
                    disease_infos = infos_response.json()
                    disease_genes = genes_response.json()
                except Exception as e:
                    print(f"Error fetching {orpha_id}: {e}", file=sys.stderr)
                    raise
                finally:
                    if no % 100 == 0:
                        print(
                            f"done fetching ORPHAcode details {no}/{len(disease_ids)}",
                            file=sys.stderr,
                        )

            disease_columns = _disease_columns(orpha_id, disease_infos["data"]["results"])
            associations = disease_genes["data"]["results"].get("DisorderGeneAssociation") or []
            for association in associations:
                assoc_status = association["DisorderGeneAssociationStatus"]
                for hgnc_id in _association_hgnc_ids(association, symbol_to_hgnc):
                    writer.writerow(
                        {"hgnc_id": hgnc_id, "assoc_status": assoc_status, **disease_columns}
                    )

        print("Fetching ORPHAcode details...", file=sys.stderr)
        limiter = trio.CapacityLimiter(10)
        async with trio.open_nursery() as nursery:
            for no, disease_id in enumerate(disease_ids):
                nursery.start_soon(work, no, disease_id, limiter)
        print("...done", file=sys.stderr)
|
||
|
||
if __name__ == "__main__":
    # ``main`` is a coroutine function: calling it directly would only create a
    # coroutine object that is never awaited, so pass the function to trio.run.
    trio.run(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters