Skip to content

Commit

Permalink
feat: replace OrphaPacket by ORDO CSV (#84)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Jan 16, 2024
1 parent 06c800c commit 988524d
Show file tree
Hide file tree
Showing 10 changed files with 50 additions and 56 deletions.
8 changes: 4 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ rule all:
#
# genes
f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz",
f"work/download/genes/orphapacket/{DV.orphapacket}/orphapacket.tar.gz",
f"work/download/genes/ordo/{DV.ordo}/ordo.csv",
"work/download/genes/alphamissense/1/AlphaMissense_gene_hg38.tsv.gz",
f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz",
"work/genes/decipher/v3/decipher_hi_prediction.tsv.gz",
Expand All @@ -96,7 +96,7 @@ rule all:
f"work/genes/gnomad/{DV.gnomad_constraints}/gnomad_constraints.tsv",
f"work/genes/hgnc/{DV.today}/hgnc_info.jsonl",
f"work/genes/omim/{DV.hpo}+{DV.today}/omim_diseases.tsv",
f"work/genes/orphapacket/{DV.orphapacket}+{DV.today}/orpha_diseases.tsv",
f"work/genes/ordo/{DV.ordo}/orpha_diseases.tsv",
"work/genes/rcnv/2022/rcnv_collins_2022.tsv",
"work/genes/shet/2019/shet_weghorn_2019.tsv",
f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh37.tsv",
Expand Down Expand Up @@ -177,7 +177,7 @@ rule all:
f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
# ----- genes
f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.orphapacket}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.ordo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
# -- worker data
f"output/full/worker/genes-regions-grch37-{DV.refseq_37}+{PV.worker}/refseq_genes.bin",
f"output/full/worker/genes-regions-grch37-{DV.ensembl_37}+{PV.worker}/ensembl_genes.bin",
Expand Down Expand Up @@ -350,7 +350,7 @@ include: "rules/work/genes/mehari_data_tx.smk"
include: "rules/work/genes/ncbi.smk"
include: "rules/work/genes/omim.smk"
include: "rules/work/genes/panelapp.smk"
include: "rules/work/genes/orphapacket.smk"
include: "rules/work/genes/ordo.smk"
include: "rules/work/genes/rcnv.smk"
include: "rules/work/genes/shet.smk"
include: "rules/work/genes/domino.smk"
Expand Down
2 changes: 1 addition & 1 deletion download_urls.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
strategy: manual
count: null

- url: https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz
- url: https://data.bioontology.org/ontologies/ORDO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv
excerpt_strategy:
strategy: no-excerpt
count: null
Expand Down
3 changes: 0 additions & 3 deletions excerpt-data/6045c008f1c0f370/url.txt

This file was deleted.

3 changes: 0 additions & 3 deletions excerpt-data/6045c008f1c0f370/v10.1.tar.gz

This file was deleted.

4 changes: 2 additions & 2 deletions rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl",
ncbi="work/genes/entrez/{date}/gene_info.jsonl",
omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv",
orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv",
orpha="work/genes/ordo/{v_orpha}/orpha_diseases.tsv",
panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl",
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
Expand Down Expand Up @@ -65,6 +65,6 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
\
--value v_annonars={wildcards.v_annonars} \
--value v_downloader={PV.downloader} \
--value v_orphapacket={wildcards.v_orpha} \
--value v_ordo={wildcards.v_orpha} \
> {output.spec_yaml}
"""
2 changes: 1 addition & 1 deletion rules/output/annonars/genes.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ x-created-from:
- name: OMIM
version: {{ today }}
- name: ORDO
version: {{ v_orphapacket }}
version: {{ v_ordo }}
- name: rCNV pHaplo/pTriplo scores
version: 2022-Collins-et-al
- name: sHet scores
Expand Down
34 changes: 34 additions & 0 deletions rules/work/genes/ordo.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
## Rules related to ORDO download


# Download the ORDO CSV export from data.bioontology.org and store its MD5
# checksum next to it.  Both output paths are keyed on the configured DV.ordo
# version.
#
# NOTE(review): the URL pins submission 28 while the output path uses DV.ordo;
# presumably submission 28 is the release named by DV.ordo -- confirm whenever
# the version is bumped.
# NOTE: --no-check-certificate makes wget skip TLS certificate validation for
# this download.
rule genes_ordo_download: # -- download ORDO CSV file
output:
csv=f"work/download/genes/ordo/{DV.ordo}/ordo.csv",
csv_md5=f"work/download/genes/ordo/{DV.ordo}/ordo.csv.md5",
shell:
r"""
wget --no-check-certificate \
-O {output.csv} \
'https://data.bioontology.org/ontologies/ORDO/submissions/28/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv'
md5sum {output.csv} > {output.csv_md5}
"""

# Run scripts/genes-orpha-diseases.py on the downloaded ORDO CSV, sort the
# result with qsv, re-emit it tab-delimited, and write the MD5 checksum of the
# resulting TSV.
#
# The shell string is intentionally NOT a raw string: Python turns '\t' into a
# literal tab character before the command reaches the shell, and joins the
# backslash-continued pipeline into a single line.
#
# NOTE(review): scripts/genes-orpha-diseases.py appears to still read a second
# positional argument (sys.argv[2]) as a base path, but only the CSV path is
# passed here -- confirm the script's command line matches this invocation.
rule genes_ordo_convert: # -- postprocess file for HGNC gene IDs
    input:
        csv="work/download/genes/ordo/{version}/ordo.csv",
    output:
        tsv="work/genes/ordo/{version}/orpha_diseases.tsv",
        tsv_md5="work/genes/ordo/{version}/orpha_diseases.tsv.md5",
    shell:
        """
        TMPDIR=$(mktemp -d)
        export TMPDIR
        trap 'rm -rf "$TMPDIR"' ERR EXIT

        python ./scripts/genes-orpha-diseases.py {input.csv} \
        | qsv sort -d '\t' \
        | qsv fmt -t '\t' \
        > {output.tsv}

        md5sum {output.tsv} > {output.tsv_md5}
        """
34 changes: 0 additions & 34 deletions rules/work/genes/orphapacket.smk

This file was deleted.

10 changes: 5 additions & 5 deletions scripts/genes-orpha-diseases.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
"""Helper script to extract gene-disease association from orphapacket."""
"""Helper script to extract gene-disease association from ORDO CSV."""

import csv
import json
Expand All @@ -8,13 +8,13 @@


def main():
symbol_to_hgnc = {}
records = {}
with open(sys.argv[1], "rt") as inputf:
reader = csv.DictReader(inputf, delimiter="\t")
reader = csv.DictReader(inputf, delimiter=",")
for record in reader:
symbol_to_hgnc[record["gene_symbol"]] = record["hgnc_id"]
records[record["gene_symbol"]] = record["hgnc_id"]

print(f"# xlink entries: {len(symbol_to_hgnc)}", file=sys.stderr)
print(f"# xlink entries: {len(records)}", file=sys.stderr)

base_path = pathlib.Path(sys.argv[2])
print("\t".join(["hgnc_id", "orpha_id", "disease_name"]))
Expand Down
6 changes: 3 additions & 3 deletions varfish_db_downloader/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ class DataVersions:
acmg_sf: str
#: HPO
hpo: str
#: OrphaPacket
orphapacket: str
#: ORDO
ordo: str
#: Pathogenic MMS
patho_mms: str
#: Mehari transcript data.
Expand Down Expand Up @@ -162,7 +162,7 @@ class DataVersions:
dbsnp="b151",
acmg_sf="3.1",
hpo="20230606",
orphapacket="10.1",
ordo="4.4",
patho_mms="20220730",
mehari_tx="0.4.4",
clinvar_release=CLINVAR_RELEASE,
Expand Down

0 comments on commit 988524d

Please sign in to comment.