From 482bdd0f4982172d1548d571a6b9f60e4da0ef29 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Fri, 19 Jan 2024 12:38:32 +0100 Subject: [PATCH] feat: replace orphapacket by orphadata API access (#84) (#85) --- .gitignore | 3 + Snakefile | 15 +- download_urls.yml | 55 +- environment.yml | 8 +- excerpt-data/111d8c6e08038f62/20 | 3 + excerpt-data/111d8c6e08038f62/url.txt | 3 + .../1963f3c58ea066be/omim_unmapped_terms.tsv | 3 + excerpt-data/1963f3c58ea066be/url.txt | 3 + excerpt-data/32c97f6adaf88f01/hp.obo | 3 + excerpt-data/32c97f6adaf88f01/url.txt | 3 + .../366878c8178827e3/phenotype_to_genes.txt | 3 - excerpt-data/366878c8178827e3/url.txt | 3 - excerpt-data/41961c7350780224/phenotype.hpoa | 3 + excerpt-data/41961c7350780224/url.txt | 3 + excerpt-data/5dfdf97f46a7c299/hp.obo | 3 - excerpt-data/5dfdf97f46a7c299/url.txt | 3 - excerpt-data/6045c008f1c0f370/url.txt | 3 - excerpt-data/6045c008f1c0f370/v10.1.tar.gz | 3 - excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv | 3 + excerpt-data/615312ce3f5fc1bf/url.txt | 3 + .../617bebe58c82f24e/CTD_diseases.tsv.gz | 3 + excerpt-data/617bebe58c82f24e/url.txt | 3 + excerpt-data/652646c24140df2a/mondo.obo | 3 + excerpt-data/652646c24140df2a/url.txt | 3 + excerpt-data/6f378db589a4bbb9/orphacodes | 3 + excerpt-data/6f378db589a4bbb9/url.txt | 3 + excerpt-data/91f964d1aa8367a5/20 | 3 + excerpt-data/91f964d1aa8367a5/url.txt | 3 + .../97187ff23d7d2773/phenotype_to_genes.txt | 3 + excerpt-data/97187ff23d7d2773/url.txt | 3 + excerpt-data/__orphadata__/orphadata.jsonl | 3 + excerpt-data/a0f9f11118d32143/download | 3 + excerpt-data/a0f9f11118d32143/url.txt | 3 + .../ac92d419f271d95f/genes_to_phenotype.txt | 3 + excerpt-data/ac92d419f271d95f/url.txt | 3 + excerpt-data/afba4597f963e400/phenotype.hpoa | 3 - excerpt-data/afba4597f963e400/url.txt | 3 - .../e3f85776a4d3a44b/genes_to_phenotype.txt | 3 - excerpt-data/e3f85776a4d3a44b/url.txt | 3 - excerpt-data/e4097d7046a53998/omim_import.obo | 3 + excerpt-data/e4097d7046a53998/url.txt | 3 + .../ede624c2cd588939/omim-unmapped.csv | 3 + excerpt-data/ede624c2cd588939/url.txt | 3 + rules/output/annonars/genes.smk | 16 +- rules/output/annonars/genes.spec.yaml | 2 +- rules/output/worker/hgnc.smk | 5 + rules/work/genes/conditions.smk | 39 + rules/work/genes/ctd.smk | 15 + rules/work/genes/decipher.smk | 25 +- rules/work/genes/do.smk | 24 + rules/work/genes/mondo.smk | 14 + rules/work/genes/omim.smk | 7 +- rules/work/genes/orphadata.smk | 23 + rules/work/genes/orphapacket.smk | 34 - scripts/genes-integrate-diseases.py | 1321 +++++++++++++++++ scripts/genes-orpha-diseases.py | 92 +- scripts/genes-xlink-hgnc.jq | 5 +- varfish_db_downloader/versions.py | 8 +- 58 files changed, 1699 insertions(+), 129 deletions(-) create mode 100644 excerpt-data/111d8c6e08038f62/20 create mode 100644 excerpt-data/111d8c6e08038f62/url.txt create mode 100644 excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv create mode 100644 excerpt-data/1963f3c58ea066be/url.txt create mode 100644 excerpt-data/32c97f6adaf88f01/hp.obo create mode 100644 excerpt-data/32c97f6adaf88f01/url.txt delete mode 100644 excerpt-data/366878c8178827e3/phenotype_to_genes.txt delete mode 100644 excerpt-data/366878c8178827e3/url.txt create mode 100644 excerpt-data/41961c7350780224/phenotype.hpoa create mode 100644 excerpt-data/41961c7350780224/url.txt delete mode 100644 excerpt-data/5dfdf97f46a7c299/hp.obo delete mode 100644 excerpt-data/5dfdf97f46a7c299/url.txt delete mode 100644 excerpt-data/6045c008f1c0f370/url.txt delete mode 100644 excerpt-data/6045c008f1c0f370/v10.1.tar.gz 
create mode 100644 excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv create mode 100644 excerpt-data/615312ce3f5fc1bf/url.txt create mode 100644 excerpt-data/617bebe58c82f24e/CTD_diseases.tsv.gz create mode 100644 excerpt-data/617bebe58c82f24e/url.txt create mode 100644 excerpt-data/652646c24140df2a/mondo.obo create mode 100644 excerpt-data/652646c24140df2a/url.txt create mode 100644 excerpt-data/6f378db589a4bbb9/orphacodes create mode 100644 excerpt-data/6f378db589a4bbb9/url.txt create mode 100644 excerpt-data/91f964d1aa8367a5/20 create mode 100644 excerpt-data/91f964d1aa8367a5/url.txt create mode 100644 excerpt-data/97187ff23d7d2773/phenotype_to_genes.txt create mode 100644 excerpt-data/97187ff23d7d2773/url.txt create mode 100644 excerpt-data/__orphadata__/orphadata.jsonl create mode 100644 excerpt-data/a0f9f11118d32143/download create mode 100644 excerpt-data/a0f9f11118d32143/url.txt create mode 100644 excerpt-data/ac92d419f271d95f/genes_to_phenotype.txt create mode 100644 excerpt-data/ac92d419f271d95f/url.txt delete mode 100644 excerpt-data/afba4597f963e400/phenotype.hpoa delete mode 100644 excerpt-data/afba4597f963e400/url.txt delete mode 100644 excerpt-data/e3f85776a4d3a44b/genes_to_phenotype.txt delete mode 100644 excerpt-data/e3f85776a4d3a44b/url.txt create mode 100644 excerpt-data/e4097d7046a53998/omim_import.obo create mode 100644 excerpt-data/e4097d7046a53998/url.txt create mode 100644 excerpt-data/ede624c2cd588939/omim-unmapped.csv create mode 100644 excerpt-data/ede624c2cd588939/url.txt create mode 100644 rules/work/genes/conditions.smk create mode 100644 rules/work/genes/ctd.smk create mode 100644 rules/work/genes/do.smk create mode 100644 rules/work/genes/mondo.smk create mode 100644 rules/work/genes/orphadata.smk delete mode 100644 rules/work/genes/orphapacket.smk create mode 100644 scripts/genes-integrate-diseases.py diff --git a/.gitignore b/.gitignore index d306f04..982b6d3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Ignore all pickled data +*.pickle* + # Ignore the workflow directories. 
/work/ /output/ diff --git a/Snakefile b/Snakefile index 43eacca..9c8a888 100644 --- a/Snakefile +++ b/Snakefile @@ -86,8 +86,9 @@ rule all: # # genes f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz", - f"work/download/genes/orphapacket/{DV.orphapacket}/orphapacket.tar.gz", "work/download/genes/alphamissense/1/AlphaMissense_gene_hg38.tsv.gz", + f"work/download/genes/ctd/{DV.today}/CTD_diseases.tsv.gz", + f"work/download/do/{DV.today}/omim-unmapped.csv", f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz", "work/genes/decipher/v3/decipher_hi_prediction.tsv.gz", f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv", @@ -96,7 +97,8 @@ rule all: f"work/genes/gnomad/{DV.gnomad_constraints}/gnomad_constraints.tsv", f"work/genes/hgnc/{DV.today}/hgnc_info.jsonl", f"work/genes/omim/{DV.hpo}+{DV.today}/omim_diseases.tsv", - f"work/genes/orphapacket/{DV.orphapacket}+{DV.today}/orpha_diseases.tsv", + f"work/genes/orphadata/{DV.orphadata}/orphadata.jsonl", + f"work/genes/mondo/{DV.today}/mondo.obo", "work/genes/rcnv/2022/rcnv_collins_2022.tsv", "work/genes/shet/2019/shet_weghorn_2019.tsv", f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh37.tsv", @@ -177,7 +179,7 @@ rule all: f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY", # ----- genes - f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.orphapacket}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", + f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", # -- worker data f"output/full/worker/genes-regions-grch37-{DV.refseq_37}+{PV.worker}/refseq_genes.bin", f"output/full/worker/genes-regions-grch37-{DV.ensembl_37}+{PV.worker}/ensembl_genes.bin", @@ -341,6 +343,9 @@ include: "rules/work/misc/hpo.smk" include: "rules/work/genes/alphamissense.smk" include: "rules/work/genes/dbnsfp.smk" include: "rules/work/genes/clingen.smk" +include: "rules/work/genes/conditions.smk" +include: "rules/work/genes/ctd.smk" +include: "rules/work/genes/do.smk" include: "rules/work/genes/decipher.smk" include: "rules/work/genes/ensembl.smk" include: "rules/work/genes/gnomad.smk" @@ -350,11 +355,11 @@ include: "rules/work/genes/mehari_data_tx.smk" include: "rules/work/genes/ncbi.smk" include: "rules/work/genes/omim.smk" include: "rules/work/genes/panelapp.smk" -include: "rules/work/genes/orphapacket.smk" +include: "rules/work/genes/mondo.smk" +include: "rules/work/genes/orphadata.smk" include: "rules/work/genes/rcnv.smk" include: "rules/work/genes/shet.smk" include: "rules/work/genes/domino.smk" -include: "rules/work/genes/clingen.smk" # Reference sequence--related rules. include: "rules/work/reference/human.smk" # Features (position and not variant specific). diff --git a/download_urls.yml b/download_urls.yml index a4f0af8..4b3ecea 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -1,3 +1,48 @@ +# Note that this just tests the availability of the OrphaData API. We have a file in +# excerpts/__orphadata__ that is used by `genes-orpha-diseases.py` in CI=true mode. 
+- url: https://api.orphadata.com/rd-cross-referencing/orphacodes + excerpt_strategy: + strategy: no-excerpt + count: null +- url: https://api.orphadata.com/rd-cross-referencing/orphacodes/20?lang=en + excerpt_strategy: + strategy: no-excerpt + count: null +- url: https://api.orphadata.com/rd-associated-genes/orphacodes/20 + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://raw.githubusercontent.com/monarch-initiative/mondo-ingest/main/src/ontology/reports/omim_unmapped_terms.tsv + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/DO_NON_Production_Files/omim_import.obo + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/DOreports/OMIMinDO.tsv + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/reports/omim-unmapped.csv + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://ctdbase.org/reports/CTD_diseases.tsv.gz + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: http://purl.obolibrary.org/obo/mondo.obo + excerpt_strategy: + strategy: no-excerpt + count: null + - url: https://panelapp.genomicsengland.co.uk/api/v1/entities/ excerpt_strategy: strategy: no-excerpt @@ -101,7 +146,7 @@ strategy: manual count: null -- url: https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz +- url: https://data.bioontology.org/ontologies/ORDO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv excerpt_strategy: strategy: no-excerpt count: null @@ -130,19 +175,19 @@ strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/hp.obo +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/hp.obo excerpt_strategy: strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/phenotype.hpoa +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/phenotype.hpoa excerpt_strategy: strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/phenotype_to_genes.txt +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/phenotype_to_genes.txt excerpt_strategy: strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/genes_to_phenotype.txt +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/genes_to_phenotype.txt excerpt_strategy: strategy: no-excerpt count: null diff --git a/environment.yml b/environment.yml index 1f42695..5e552bb 100644 --- a/environment.yml +++ b/environment.yml @@ -12,6 +12,8 @@ dependencies: - click - loguru - numpy + - pydantic + - pronto >=2.5,<3.0 - pyyaml - requests - requests-ftp @@ -41,9 +43,13 @@ dependencies: # Parallel (de)compression. 
- pigz # Varfish related - - annonars =0.33.0 + - annonars =0.34.0 - viguno =0.2.0 - mehari =0.21.1 - varfish-server-worker =0.10.2 # S3 uploads - s5cmd =2.1.0 + # async HTTP requests + - httpx =0.25.0 + - httpcore =0.18.0 + - trio diff --git a/excerpt-data/111d8c6e08038f62/20 b/excerpt-data/111d8c6e08038f62/20 new file mode 100644 index 0000000..862d894 --- /dev/null +++ b/excerpt-data/111d8c6e08038f62/20 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b610d631a11a2f6f4b93b8f7f7458447fa1a974ade2a18b1b9c9b5047dac516c +size 3338 diff --git a/excerpt-data/111d8c6e08038f62/url.txt b/excerpt-data/111d8c6e08038f62/url.txt new file mode 100644 index 0000000..2a496ff --- /dev/null +++ b/excerpt-data/111d8c6e08038f62/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bda92b632aaeacd032c57d87e6420ccac3945bcd78bf51bafc117da4eb63a601 +size 69 diff --git a/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv b/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv new file mode 100644 index 0000000..94d4f19 --- /dev/null +++ b/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384eccb1d84e8a5036027cd340d2b4b18ce41170de28407f3022d7f3dd2392dc +size 4779 diff --git a/excerpt-data/1963f3c58ea066be/url.txt b/excerpt-data/1963f3c58ea066be/url.txt new file mode 100644 index 0000000..af13e07 --- /dev/null +++ b/excerpt-data/1963f3c58ea066be/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2153c0f0281a0399c77b0964d96c1286f6716eac34304f68f4e838fe41a02bdd +size 116 diff --git a/excerpt-data/32c97f6adaf88f01/hp.obo b/excerpt-data/32c97f6adaf88f01/hp.obo new file mode 100644 index 0000000..3e8f702 --- /dev/null +++ b/excerpt-data/32c97f6adaf88f01/hp.obo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508bb0326603671327e9f4af05beffd1800e053557c7c1a0d137fd1f458bb0f7 +size 9719364 diff --git a/excerpt-data/32c97f6adaf88f01/url.txt b/excerpt-data/32c97f6adaf88f01/url.txt new file mode 100644 index 0000000..0f60dc3 --- /dev/null +++ b/excerpt-data/32c97f6adaf88f01/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b057fd342e531569cd4c2af122986569a44bc55348bcbc8104866b9ea696acfb +size 94 diff --git a/excerpt-data/366878c8178827e3/phenotype_to_genes.txt b/excerpt-data/366878c8178827e3/phenotype_to_genes.txt deleted file mode 100644 index 4308961..0000000 --- a/excerpt-data/366878c8178827e3/phenotype_to_genes.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6d9749cfe77403a5133dd86ef2419eae75ceb86b4ac7df1af2872e635d4e77a -size 58253258 diff --git a/excerpt-data/366878c8178827e3/url.txt b/excerpt-data/366878c8178827e3/url.txt deleted file mode 100644 index 6e90764..0000000 --- a/excerpt-data/366878c8178827e3/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9894daafe63bf29b0462ecfe2937aefb11387dd3fbf2070ed1e53b0e37cc4fb9 -size 110 diff --git a/excerpt-data/41961c7350780224/phenotype.hpoa b/excerpt-data/41961c7350780224/phenotype.hpoa new file mode 100644 index 0000000..ed13380 --- /dev/null +++ b/excerpt-data/41961c7350780224/phenotype.hpoa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f10bb697f53d1767335f8f7390dc85ec659d78b946abd1a31decd2058b8ffe9 +size 31795534 diff --git a/excerpt-data/41961c7350780224/url.txt b/excerpt-data/41961c7350780224/url.txt new file mode 100644 index 0000000..2a81f51 --- 
/dev/null +++ b/excerpt-data/41961c7350780224/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40157925175644167a768330abaa318c6cc3e961d0463e8a3ee45b77513d4dd4 +size 102 diff --git a/excerpt-data/5dfdf97f46a7c299/hp.obo b/excerpt-data/5dfdf97f46a7c299/hp.obo deleted file mode 100644 index d9da674..0000000 --- a/excerpt-data/5dfdf97f46a7c299/hp.obo +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6fa2f31ade75580469a996be3f2ebbcd3812b6ae75b3b02820a201bb1c4736c9 -size 9222726 diff --git a/excerpt-data/5dfdf97f46a7c299/url.txt b/excerpt-data/5dfdf97f46a7c299/url.txt deleted file mode 100644 index 17e6803..0000000 --- a/excerpt-data/5dfdf97f46a7c299/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2da8d523a44e41522c6bd46943f8cd36d4e73c22388846174372a412cf59f495 -size 94 diff --git a/excerpt-data/6045c008f1c0f370/url.txt b/excerpt-data/6045c008f1c0f370/url.txt deleted file mode 100644 index 17796bd..0000000 --- a/excerpt-data/6045c008f1c0f370/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58b5f890e962de336ac264c788ae9136ad1eb69f52c1f536379c2984b85beef1 -size 71 diff --git a/excerpt-data/6045c008f1c0f370/v10.1.tar.gz b/excerpt-data/6045c008f1c0f370/v10.1.tar.gz deleted file mode 100644 index 4779579..0000000 --- a/excerpt-data/6045c008f1c0f370/v10.1.tar.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:38de2709b2f9a9c4e0b9ec18c5ebae337a47e97897808069dcee7ba5c39d3224 -size 1503498 diff --git a/excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv b/excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv new file mode 100644 index 0000000..08eb9cd --- /dev/null +++ b/excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82d6abe489be419dd0bebcd8e3da518e8fd6b6bf4cfd1a2670df8bdc7a81d71 +size 355299 diff --git a/excerpt-data/615312ce3f5fc1bf/url.txt b/excerpt-data/615312ce3f5fc1bf/url.txt new file mode 100644 index 0000000..b3b97a9 --- /dev/null +++ b/excerpt-data/615312ce3f5fc1bf/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5bc9bbc38d8ec6e3514ee71e383ce3a355dcf77b49682cf1d46bffe86bd15f +size 99 diff --git a/excerpt-data/617bebe58c82f24e/CTD_diseases.tsv.gz b/excerpt-data/617bebe58c82f24e/CTD_diseases.tsv.gz new file mode 100644 index 0000000..d879265 --- /dev/null +++ b/excerpt-data/617bebe58c82f24e/CTD_diseases.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e37e0f2cae2f4e6ca6f26ab47508c0ed461b44870fbb7b69cecaf0a9e5c5db1d +size 1778371 diff --git a/excerpt-data/617bebe58c82f24e/url.txt b/excerpt-data/617bebe58c82f24e/url.txt new file mode 100644 index 0000000..9c4a699 --- /dev/null +++ b/excerpt-data/617bebe58c82f24e/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d4d842332879379d1bfffb8f9238d62a95a0b59d8ae0063195cc62c62ad940 +size 48 diff --git a/excerpt-data/652646c24140df2a/mondo.obo b/excerpt-data/652646c24140df2a/mondo.obo new file mode 100644 index 0000000..9c120a6 --- /dev/null +++ b/excerpt-data/652646c24140df2a/mondo.obo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce8ebbfdb594fe32335e6c26c6f10d0c085da5f47e74bf4fbef2a15690b6fe8 +size 33100807 diff --git a/excerpt-data/652646c24140df2a/url.txt b/excerpt-data/652646c24140df2a/url.txt new file mode 100644 index 0000000..82d4fa2 --- /dev/null +++ b/excerpt-data/652646c24140df2a/url.txt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a0983bc3cc220a3d3709f27ecd4524035666eb95fe984d61276b0dcb275f882 +size 41 diff --git a/excerpt-data/6f378db589a4bbb9/orphacodes b/excerpt-data/6f378db589a4bbb9/orphacodes new file mode 100644 index 0000000..f3dac85 --- /dev/null +++ b/excerpt-data/6f378db589a4bbb9/orphacodes @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:848ee5fb05bf6c5431aff29eab7ad1bacbc5cbc984cdecba4f5672a30a9ac1e7 +size 1242966 diff --git a/excerpt-data/6f378db589a4bbb9/url.txt b/excerpt-data/6f378db589a4bbb9/url.txt new file mode 100644 index 0000000..228bcdc --- /dev/null +++ b/excerpt-data/6f378db589a4bbb9/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd4412fd56abffcedb9d340f8ac92cdee271bcc5c3e535418384c286675ff0b4 +size 58 diff --git a/excerpt-data/91f964d1aa8367a5/20 b/excerpt-data/91f964d1aa8367a5/20 new file mode 100644 index 0000000..fc06abb --- /dev/null +++ b/excerpt-data/91f964d1aa8367a5/20 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec854b1c8f931b18f156c340a1a8ec46509f9207f81c2a04b0b3a519d9b84a87 +size 2191 diff --git a/excerpt-data/91f964d1aa8367a5/url.txt b/excerpt-data/91f964d1aa8367a5/url.txt new file mode 100644 index 0000000..8cfa63d --- /dev/null +++ b/excerpt-data/91f964d1aa8367a5/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5530926f47d8db160ef199d1836317edc614b80fb5138893cfc93ce349f19f90 +size 60 diff --git a/excerpt-data/97187ff23d7d2773/phenotype_to_genes.txt b/excerpt-data/97187ff23d7d2773/phenotype_to_genes.txt new file mode 100644 index 0000000..90d9d53 --- /dev/null +++ b/excerpt-data/97187ff23d7d2773/phenotype_to_genes.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38caf0ffede297f84dc2d41a238828683957f5baa72a6cc226661865d81c562d +size 60158078 diff --git a/excerpt-data/97187ff23d7d2773/url.txt b/excerpt-data/97187ff23d7d2773/url.txt new file mode 100644 index 0000000..3e40490 --- /dev/null +++ b/excerpt-data/97187ff23d7d2773/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed40c33b60c6ba43a4c15d7184aab42bce54bd1d88f29a4b8fa754add577d4c +size 110 diff --git a/excerpt-data/__orphadata__/orphadata.jsonl b/excerpt-data/__orphadata__/orphadata.jsonl new file mode 100644 index 0000000..eebd9c2 --- /dev/null +++ b/excerpt-data/__orphadata__/orphadata.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c7f9a3df098e467957e4dcbcc8bf78b5713b11e9e3060ea0f67437dca0cb71 +size 25839170 diff --git a/excerpt-data/a0f9f11118d32143/download b/excerpt-data/a0f9f11118d32143/download new file mode 100644 index 0000000..759232c --- /dev/null +++ b/excerpt-data/a0f9f11118d32143/download @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4f7b8049f2238d04b0453d2422d7499fd7196162027b7a3b4f2f91ca9b16a2 +size 2509937 diff --git a/excerpt-data/a0f9f11118d32143/url.txt b/excerpt-data/a0f9f11118d32143/url.txt new file mode 100644 index 0000000..b8b04dd --- /dev/null +++ b/excerpt-data/a0f9f11118d32143/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa28fe18f2378e43812405874df6a303db980507aea705278cbc047d03b3d38c +size 118 diff --git a/excerpt-data/ac92d419f271d95f/genes_to_phenotype.txt b/excerpt-data/ac92d419f271d95f/genes_to_phenotype.txt new file mode 100644 index 0000000..797ff46 --- /dev/null +++ b/excerpt-data/ac92d419f271d95f/genes_to_phenotype.txt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:32f95376f4e987fa28397948d492203a5a34950ef22e6c798ac79d1f0919addd +size 18998203 diff --git a/excerpt-data/ac92d419f271d95f/url.txt b/excerpt-data/ac92d419f271d95f/url.txt new file mode 100644 index 0000000..882caf4 --- /dev/null +++ b/excerpt-data/ac92d419f271d95f/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e534bb315ac040576f817d5badb3f20b9ccbd43b8d72ff4bb512747efa77e2 +size 110 diff --git a/excerpt-data/afba4597f963e400/phenotype.hpoa b/excerpt-data/afba4597f963e400/phenotype.hpoa deleted file mode 100644 index 56d35a6..0000000 --- a/excerpt-data/afba4597f963e400/phenotype.hpoa +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27483a513d23276d21799803390b6aebc3194242a9996e7225262133a8f2ff7c -size 31413071 diff --git a/excerpt-data/afba4597f963e400/url.txt b/excerpt-data/afba4597f963e400/url.txt deleted file mode 100644 index 6e8bd6d..0000000 --- a/excerpt-data/afba4597f963e400/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd376b656093f1bd4778a32bae021cd22ec6e282b8737a143964cc4c26d2fd84 -size 102 diff --git a/excerpt-data/e3f85776a4d3a44b/genes_to_phenotype.txt b/excerpt-data/e3f85776a4d3a44b/genes_to_phenotype.txt deleted file mode 100644 index ea9f224..0000000 --- a/excerpt-data/e3f85776a4d3a44b/genes_to_phenotype.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47f2f26a9767921165beb34f11d00bf6bc4fb1815613d4433403df68f2b6c52c -size 18182540 diff --git a/excerpt-data/e3f85776a4d3a44b/url.txt b/excerpt-data/e3f85776a4d3a44b/url.txt deleted file mode 100644 index 6223501..0000000 --- a/excerpt-data/e3f85776a4d3a44b/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17384bdd61a8c02121400083229f431a6ab2f8bcceb1b0cebbf1a2e802a7933d -size 110 diff --git a/excerpt-data/e4097d7046a53998/omim_import.obo b/excerpt-data/e4097d7046a53998/omim_import.obo new file mode 100644 index 0000000..9c8c55a --- /dev/null +++ b/excerpt-data/e4097d7046a53998/omim_import.obo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aacf57a9ef7d737f08915c0160b2174453eaa71061ee47753febb353daee12e +size 9631131 diff --git a/excerpt-data/e4097d7046a53998/url.txt b/excerpt-data/e4097d7046a53998/url.txt new file mode 100644 index 0000000..c8e55db --- /dev/null +++ b/excerpt-data/e4097d7046a53998/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be34d29606ea9f6ad67d5d7c777d963b46c024a85ab2709d4f66c46b326ea704 +size 120 diff --git a/excerpt-data/ede624c2cd588939/omim-unmapped.csv b/excerpt-data/ede624c2cd588939/omim-unmapped.csv new file mode 100644 index 0000000..5f421b3 --- /dev/null +++ b/excerpt-data/ede624c2cd588939/omim-unmapped.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36bed4998159bf50c84fa2a39f79210723c63953586fb5ab01f11327c03db741 +size 278605 diff --git a/excerpt-data/ede624c2cd588939/url.txt b/excerpt-data/ede624c2cd588939/url.txt new file mode 100644 index 0000000..2740d94 --- /dev/null +++ b/excerpt-data/ede624c2cd588939/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd7010cd3fca806cade1ebb0ed577a723f197b48b486bf631c0e71dc2f3f523 +size 106 diff --git a/rules/output/annonars/genes.smk b/rules/output/annonars/genes.smk index b461a34..4471bc5 100644 --- a/rules/output/annonars/genes.smk +++ b/rules/output/annonars/genes.smk @@ -11,8 +11,9 @@ rule 
output_annonars_genes: # -- build annonars genes RocksDB file hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl", ncbi="work/genes/entrez/{date}/gene_info.jsonl", omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv", - orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv", + orpha="work/genes/orphadata/{date}/orpha_diseases.tsv", panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl", + conditions="work/genes/conditions/{v_hpo}+{date}/conditions.jsonl", rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv", shet="work/genes/shet/2019/shet_weghorn_2019.tsv", gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz", @@ -20,11 +21,11 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file decipher_hi="work/genes/decipher/v3/decipher_hi_prediction.tsv.gz", output: rocksdb_identity=( - "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/" + "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/" "rocksdb/IDENTITY" ), spec_yaml=( - "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/" + "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/" "spec.yaml" ), wildcard_constraints: @@ -35,11 +36,17 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file v_annonars=RE_VERSION, shell: r""" + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + annonars gene import \ --path-out-rocksdb $(dirname {output.rocksdb_identity}) \ --path-in-acmg {input.acmg_sf} \ --path-in-clingen-37 {input.clingen_37} \ --path-in-clingen-38 {input.clingen_38} \ + --path-in-conditions {input.conditions} \ --path-in-gnomad-constraints {input.gnomad_constraints} \ --path-in-dbnsfp {input.dbnsfp} \ --path-in-hgnc {input.hgnc} \ @@ -64,7 +71,6 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file --value date={wildcards.date} \ \ --value v_annonars={wildcards.v_annonars} \ - --value v_downloader={PV.downloader} \ - --value v_orphapacket={wildcards.v_orpha} \ + --value v_downloader={PV.downloader} > {output.spec_yaml} """ diff --git a/rules/output/annonars/genes.spec.yaml b/rules/output/annonars/genes.spec.yaml index e4cd860..21b050e 100644 --- a/rules/output/annonars/genes.spec.yaml +++ b/rules/output/annonars/genes.spec.yaml @@ -38,7 +38,7 @@ x-created-from: - name: OMIM version: {{ today }} - name: ORDO - version: {{ v_orphapacket }} + version: {{ v_ordo }} - name: rCNV pHaplo/pTriplo scores version: 2022-Collins-et-al - name: sHet scores diff --git a/rules/output/worker/hgnc.smk b/rules/output/worker/hgnc.smk index 9ff2ddc..1babd54 100644 --- a/rules/output/worker/hgnc.smk +++ b/rules/output/worker/hgnc.smk @@ -9,6 +9,11 @@ rule output_hgnc_xlink_binary: spec_yaml=f"output/full/worker/genes-xlink-{{date}}+{PV.worker}/genes-xlink.spec.yaml", shell: r""" + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + varfish-server-worker db to-bin \ --input-type xlink \ --path-input {input.tsv} \ diff --git a/rules/work/genes/conditions.smk b/rules/work/genes/conditions.smk new file mode 100644 index 0000000..2f65175 --- /dev/null +++ b/rules/work/genes/conditions.smk @@ -0,0 +1,39 @@ +## Rules for aggregating per-gene conditions data for annonars. 
+ + +rule work_conditions_integrate: # integrate conditions file + input: + mim2gene_medgen="work/download/genes/ncbi/{date}/mim2gene_medgen", + genes_xlink="output/full/mehari/genes-xlink-{date}/genes-xlink.tsv", + hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa", + orpha_jsonl="work/genes/orphadata/{date}/orphadata.jsonl", + panelapp_jsonl="work/download/genes/panelapp/{date}/panelapp.jsonl", + mondo_obo="work/genes/mondo/{date}/mondo.obo", + mondo_unmapped_tsv="work/genes/mondo/{date}/omim_unmapped_terms.tsv", + ctd_tsv="work/download/genes/ctd/{date}/CTD_diseases.tsv.gz", + do_omim_unmapped="work/download/do/{date}/omim-unmapped.csv", + do_omim_indo="work/download/do/{date}/OMIMinDO.tsv", + do_omim_import="work/download/do/{date}/omim_import.obo", + output: + jsonl="work/genes/conditions/{v_hpo}+{date}/conditions.jsonl", + shell: + r""" + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + + MIM2GENE_MEDGEN_PATH="{input.mim2gene_medgen}" \ + GENES_XLINK_PATH="{input.genes_xlink}" \ + HPOA_PATH="{input.hpoa}" \ + ORPHA_JSONL_PATH="{input.orpha_jsonl}" \ + PANELAPP_JSONL_PATH="{input.panelapp_jsonl}" \ + MONDO_OBO_PATH="{input.mondo_obo}" \ + MONDO_UNMAPPED_OMIM_PATH="{input.mondo_unmapped_tsv}" \ + CTD_PATH="{input.ctd_tsv}" \ + DO_OMIM_UNMAPPED_PATH="{input.do_omim_unmapped}" \ + DO_OMIM_INDO_PATH="{input.do_omim_indo}" \ + DO_OMIM_IMPORT_PATH="{input.do_omim_import}" \ + python scripts/genes-integrate-diseases.py \ + > {output.jsonl} + """ diff --git a/rules/work/genes/ctd.smk b/rules/work/genes/ctd.smk new file mode 100644 index 0000000..688a250 --- /dev/null +++ b/rules/work/genes/ctd.smk @@ -0,0 +1,15 @@ +## Rules related to CTD download + + +rule genes_ctd_download: # -- download CTD + output: + tsv="work/download/genes/ctd/{date}/CTD_diseases.tsv.gz", + shell: + """ + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + + wget -O {output.tsv} https://ctdbase.org/reports/CTD_diseases.tsv.gz + """ diff --git a/rules/work/genes/decipher.smk b/rules/work/genes/decipher.smk index 56b51a7..5537679 100644 --- a/rules/work/genes/decipher.smk +++ b/rules/work/genes/decipher.smk @@ -38,19 +38,20 @@ rule genes_decipher_hi_convert: # -- convert DECIPHER HI predictions to TSV | sed -e 's/%$//g' \ >> $TMPDIR/tmp.tsv - qsv join \ - gene_symbol {input.hgnc} \ + qsv select -d $'\t' '!omim_ids' {input.hgnc} \ + | tr ',' '\t' \ + > $TMPDIR/tmp3.tsv + + qsv join -d $'\t' \ + gene_symbol $TMPDIR/tmp3.tsv \ gene_symbol $TMPDIR/tmp.tsv \ - > $TMPDIR/tmp2.tsv - - ( \ - echo -e "hgnc_id\thgnc_symbol\tp_hi\thi_index"; \ - tail -n +2 $TMPDIR/tmp2.tsv \ - | tr ',' '\t' \ - | cut -f 1,5-7 \ - | LC_ALL=C sort \ - ) \ - | gzip -c \ + > $TMPDIR/tmp2.csv + + qsv select \ + 'hgnc_id,gene_symbol,p_hi,hi_index' \ + $TMPDIR/tmp2.csv \ + | qsv rename 'hgnc_id,hgnc_symbol,p_hi,hi_index' \ + | tr ',' '\t' \ > {output.tsv} md5sum {output.tsv} > {output.tsv_md5} diff --git a/rules/work/genes/do.smk b/rules/work/genes/do.smk new file mode 100644 index 0000000..a9f1627 --- /dev/null +++ b/rules/work/genes/do.smk @@ -0,0 +1,24 @@ +## Rules related to DO download + + +rule work_disease_ontology_download: # -- download DO files + output: + unmapped_csv="work/download/do/{date}/omim-unmapped.csv", + omimindo_tsv="work/download/do/{date}/OMIMinDO.tsv", + omim_import="work/download/do/{date}/omim_import.obo", + shell: + r""" + if [[ "$(date 
+%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + + wget -O {output.unmapped_csv} \ + https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/reports/omim-unmapped.csv + wget -O {output.omimindo_tsv} \ + https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/DOreports/OMIMinDO.tsv + wget -O {output.omim_import} \ + https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/DO_NON_Production_Files/omim_import.obo + perl -p -i -e 's/(is_a:[^!]*?),.*/\1/g' {output.omim_import} + perl -p -i -e 's/OMIM:PS/OMIM:/g' {output.omim_import} + """ diff --git a/rules/work/genes/mondo.smk b/rules/work/genes/mondo.smk new file mode 100644 index 0000000..9d67396 --- /dev/null +++ b/rules/work/genes/mondo.smk @@ -0,0 +1,14 @@ +## Rules related to MONDO data + + +rule genes_mondo_download: # -- download orphadatas + output: + obo="work/genes/mondo/{version}/mondo.obo", + omim_unmapped="work/genes/mondo/{version}/omim_unmapped_terms.tsv", + shell: + """ + wget -O {output.obo} \ + http://purl.obolibrary.org/obo/mondo.obo + wget -O {output.omim_unmapped} \ + https://raw.githubusercontent.com/monarch-initiative/mondo-ingest/main/src/ontology/reports/omim_unmapped_terms.tsv + """ diff --git a/rules/work/genes/omim.smk b/rules/work/genes/omim.smk index d33136e..bb726ee 100644 --- a/rules/work/genes/omim.smk +++ b/rules/work/genes/omim.smk @@ -12,8 +12,13 @@ rule genes_omim: # -- prepare HGNC to OMIM disease mapping """ set -x + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + export TMPDIR=$(mktemp -d) - # trap "rm -rf $TMPDIR" ERR EXIT + trap "rm -rf $TMPDIR" ERR EXIT head -n 1 {input.mim2gene} | sed -e 's/ /_/g' -e 's/#//g' \ > $TMPDIR/mim2gene.tsv diff --git a/rules/work/genes/orphadata.smk b/rules/work/genes/orphadata.smk new file mode 100644 index 0000000..0414e91 --- /dev/null +++ b/rules/work/genes/orphadata.smk @@ -0,0 +1,23 @@ +## Rules related to Orphadata download + + +rule genes_orphadata_download: # -- download orphadatas + output: + jsonl="work/genes/orphadata/{date}/orphadata.jsonl", + jsonl_md5="work/genes/orphadata/{date}/orphadata.jsonl.md5", + tsv="work/genes/orphadata/{date}/orpha_diseases.tsv", # xxx + shell: + """ + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + + python ./scripts/genes-orpha-diseases.py \ + > {output.jsonl} + + md5sum {output.jsonl} > {output.jsonl}.md5 + + echo -e "hgnc_id\torpha_id\tassoc_status\tomim_ids\tdisease_name\tdefinition" \ + > {output.tsv} + """ diff --git a/rules/work/genes/orphapacket.smk b/rules/work/genes/orphapacket.smk deleted file mode 100644 index b5201a5..0000000 --- a/rules/work/genes/orphapacket.smk +++ /dev/null @@ -1,34 +0,0 @@ -## Rules related to annotating genes with ORDO terms - - -rule genes_orphapacket_download: # -- download orphapacket file - output: - tar="work/download/genes/orphapacket/{version}/orphapacket.tar.gz", - shell: - r""" - wget -O {output.tar} \ - https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz - """ - - -rule genes_orphapacket_diseases: # -- postprocess file for HGNC gene IDs - input: - tar="work/download/genes/orphapacket/{version}/orphapacket.tar.gz", - xlink="output/full/mehari/genes-xlink-{date}/genes-xlink.tsv", - output: - 
tsv="work/genes/orphapacket/{version}+{date}/orpha_diseases.tsv", - tsv_md5="work/genes/orphapacket/{version}+{date}/orpha_diseases.tsv.md5", - shell: - """ - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" ERR EXIT - - tar -C $TMPDIR -xf $(readlink -f {input.tar}) - - python ./scripts/genes-orpha-diseases.py {input.xlink} $TMPDIR/orphapacket-*/json \ - | qsv sort -d '\t' \ - | qsv fmt -t '\t' \ - > {output.tsv} - - md5sum {output.tsv} > {output.tsv}.md5 - """ diff --git a/scripts/genes-integrate-diseases.py b/scripts/genes-integrate-diseases.py new file mode 100644 index 0000000..7e975ef --- /dev/null +++ b/scripts/genes-integrate-diseases.py @@ -0,0 +1,1321 @@ +"""Create integrated gene to disease map.""" + +import csv +import enum +import gzip +import json +import os +import pickle +import re +import sys +from itertools import chain +from typing import Dict, List, Optional, Set, Tuple + +import pronto +from loguru import logger +from pydantic import BaseModel, ConfigDict, Field + +#: Path to HGNC:ID to OMIM ID mapping file. +MIM2GENE_MEDGEN_PATH = os.environ["MIM2GENE_MEDGEN_PATH"] +#: Path to genes xlink file. +GENES_XLINK_PATH = os.environ["GENES_XLINK_PATH"] +#: Path to HPOA file. +HPOA_PATH = os.environ["HPOA_PATH"] +#: Path to the ORPHA JSONL file. +ORPHA_JSONL_PATH = os.environ["ORPHA_JSONL_PATH"] +#: Path to the PanelApp JSONL file. +PANELAPP_JSONL_PATH = os.environ["PANELAPP_JSONL_PATH"] +#: Path to the MONDO OBO file. +MONDO_OBO_PATH = os.environ["MONDO_OBO_PATH"] +#: Path to the MONDO unmapped OMIM file. +MONDO_UNMAPPED_OMIM_PATH = os.environ["MONDO_UNMAPPED_OMIM_PATH"] +#: Path to CTD file. +CTD_PATH = os.environ["CTD_PATH"] +#: DO OMIM "ummapped" file. +DO_OMIM_UNMAPPED_PATH = os.environ["DO_OMIM_UNMAPPED_PATH"] +#: DO OMIMinDO file. +DO_OMIM_INDO_PATH = os.environ["DO_OMIM_INDO_PATH"] +#: DO legacy OMIM import file. +DO_OMIM_IMPORT_PATH = os.environ["DO_OMIM_IMPORT_PATH"] + +#: Development mode => pickling of data +DEV_MODE = os.environ.get("DEV_MODE", "0") == "1" + + +class GeneXlink(BaseModel): + """Cross-reference of gene.""" + + #: HGNC ID. + hgnc_id: str + #: ENSEMBL gene ID. + ensembl_gene_id: Optional[str] + #: Entrez gene ID. + entrez_id: Optional[str] + #: Gene symbol + gene_symbol: str + #: OMIM IDs + omim_ids: List[str] + + +def parse_gene_xlink(path: str) -> List[GeneXlink]: + """Parse gene cross-reference file.""" + rows = [] + with open(path) as f: + reader = csv.DictReader(f, delimiter="\t") + for row in reader: + for key in ("ensembl_gene_id", "entrez_id"): + if row[key] == "": + row[key] = None + if not row["omim_ids"]: + row["omim_ids"] = [] + else: + row["omim_ids"] = row["omim_ids"].split(",") + rows.append(GeneXlink(**row)) + return rows + + +class Mim2geneMedgen(BaseModel): + """Mapping from OMIM ID to gene.""" + + model_config = ConfigDict(frozen=True) + + #: OMIM ID. + omim_id: str + #: Gene HGNC ID. 
+ hgnc_id: str + + def __lt__(self, other: "Mim2geneMedgen") -> bool: + """Compare mim2gene_medgen entries.""" + return (self.hgnc_id, self.omim_id) < (other.hgnc_id, other.omim_id) + + +def parse_mim2gene_medgen( + path: str, xlink_by_entrez_id: Dict[str, GeneXlink] +) -> List[Mim2geneMedgen]: + """Parse mim2gene_medgen file.""" + result = [] + with open(path, "rt") as inputf: + reader = csv.DictReader(inputf, delimiter="\t") + for row in reader: + if row["GeneID"] == "-" or row["type"] != "phenotype": + continue + if row["GeneID"] not in xlink_by_entrez_id: + logger.warning(f"Skipping NCBI Gene ID = {row['GeneID']}") + continue + result.append( + Mim2geneMedgen( + omim_id=f"OMIM:{row['#MIM number']}", + hgnc_id=xlink_by_entrez_id[row["GeneID"]].hgnc_id, + ) + ) + return list(sorted(set(result))) + + +@enum.unique +class Evidence(enum.Enum): + """Evidence from HPOA record""" + + #: Has been inferred by parsing OMIM. + INFERRED_FROM_ELECTRONIC_ANNOTATION = "IEA" + #: Extracted from articles in medical literature. + PUBLISHED_CLINICAL_STUDY = "PCS" + #: Extracted from knowledge bases such as OMIM and Orphanet. + TRACEABLE_AUTHOR_STATEMENT = "TAS" + + +@enum.unique +class Sex(enum.Enum): + """Enumeration for sex.""" + + #: Male + MALE = "MALE" + #: Female + FEMALE = "FEMALE" + + +@enum.unique +class Aspect(enum.Enum): + """Enumeration for aspect.""" + + #: Phenotype + PHENOTYPE = "P" + #: Mode of inheritance + MODE_OF_INHERITANCE = "I" + #: Clinical course + CLINICAL_COURSE = "C" + #: Modifier + CLINICAL_MODIFIER = "M" + + +class HpoaEntry(BaseModel): + """Disease-phenotype association from HPOA file.""" + + model_config = ConfigDict(use_enum_values=True) + + #: Term database ID. + database_id: str + #: Term database name. + disease_name: str + #: Whether is a positive or negative association. + qualifier: bool + #: HPO ID. + hpo_id: str + #: Evidence for this association. + reference: List[str] + #: Evidence for this association. + evidence: Evidence + #: HPO term for onset. + onset: str + #: Frequency annotation. + frequency: str + #: Sex link, if any. + sex: Optional[Sex] + #: HPO terms of modifiers. + modifier: List[str] + #: Aspect of this association. + aspect: Aspect + #: Biocuration notes. + biocuration: List[str] + + +def parse_hpoa(path: str) -> List[HpoaEntry]: + """Parse HPOA file.""" + rows = [] + with open(path) as f: + for i in range(4): + next(f) + reader = csv.DictReader(f, delimiter="\t") + for row in reader: + row["qualifier"] = row["qualifier"] != "NOT" + row["reference"] = row["reference"].split(";") + row["evidence"] = Evidence(row["evidence"]) + if row["sex"]: + row["sex"] = Sex(row["sex"].upper()) + else: + row["sex"] = None + row["modifier"] = row["modifier"].split(";") + row["aspect"] = Aspect(row["aspect"]) + row["biocuration"] = row["biocuration"].split(";") + rows.append(HpoaEntry(**row)) + return rows + + +class DiseaseNameLabel(BaseModel): + """Association of a diasease database ID to its name.""" + + #: Disease database ID. + database_id: str + #: Name in the database. 
+ disease_name: str + + +def hpoa_entries_to_disease_label_map(hpoa_entries: List[HpoaEntry]) -> Dict[str, DiseaseNameLabel]: + """Create a map of disease database ID to disease name.""" + disease_label_map = {} + for entry in hpoa_entries: + disease_label_map[entry.database_id] = DiseaseNameLabel( + database_id=entry.database_id, disease_name=entry.disease_name + ) + return disease_label_map + + +@enum.unique +class DisorderMappingRelation(enum.Enum): + """Characterisation of disorder mapping relation.""" + + #: Exact mapping + EXACT = "E" + #: ORPHA code narrower term maps to a broader term + NTBT = "NTBT" + #: ORPHA code broader term maps to a narrower term + BTNT = "BTNT" + #: ORPHA code narrower term maps to a broader term because of an exact + #: match with a synonym in the target terminology. + NTBT_E = "NTBT/E" + #: ORPHA code broader term maps to a narrower term because of an exact + #: match with a synonym in the target terminology. + BTNT_E = "BTNT/E" + #: Not yet decided / unable to decide. + ND = "ND" + + +class OrphaDisorderMapping(BaseModel): + """Mapping into an external/target disorder terminology.""" + + model_config = ConfigDict(use_enum_values=True) + + #: Identifier of source terminology with prefix of the terminology. + database_id: str + #: The relationship between ORPHA and the target terminology. + relation: DisorderMappingRelation + + +class OrphaDisorder(BaseModel): + """Description of the ORPHA disorder.""" + + #: The "ORPHA:" ID. + database_id: str + #: The name of the disorder. + database_name: str + #: Definition, if provided. + definition: Optional[str] + #: Synonym list. + synonyms: List[str] + #: Mappings in external terminologies. + mappings: List[OrphaDisorderMapping] + + +@enum.unique +class OrphaStatus(enum.Enum): + """ORPHA assessment status.""" + + #: Assessed + ASSESSED = "Assessed" + #: Not yet assessed + NOT_YET_ASSESSED = "Not yet assessed" + + +@enum.unique +class OrphaAssocationType(enum.Enum): + """ORPHA association type.""" + + ROLE = "Role in the phenotype of" + GERMLINE_LOF = "Disease-causing germline mutation(s) (loss of function) in" + GERMLINE_GOF = "Disease-causing germline mutation(s) (gain of function) in" + GERMLINE = "Disease-causing germline mutation(s) in" + SOMATIC = "Disease-causing somatic mutation(s) in" + MODIFYING_GERMLINE = "Modifying germline mutation in" + FUSION_GENE = "Part of a fusion gene in" + SUSCEPTIBILITY = "Major susceptibility factor in" + CANDIDATE_GENE = "Candidate gene tested in" + BIOMARKER = "Biomarker tested in" + + +class OrphaGeneMapping(BaseModel): + """Mapping from an ORPHA disorder to a gene.""" + + model_config = ConfigDict(use_enum_values=True) + + #: ORPHA ID. + orpha_id: str + #: Gene HGNC id. + hgnc_id: str + #: Assessment status. + status: OrphaStatus + #: The type of the assessment. + association_type: OrphaAssocationType + + +def parse_orpha_jsonl( + path: str, xlink_by_gene_symbol: Dict[str, GeneXlink] +) -> Tuple[List[OrphaDisorder], List[OrphaGeneMapping]]: + """Parse ORPHA JSONL file.""" + disorders = [] + gene_mappings = [] + with open(path) as inputf: + for line in inputf: + record = json.loads(line) + # Extract disease details and cross-references to other terminologies.
+ disorder_mappings = [] + for mapping in record["cross_references"].get("ExternalReference") or []: + disorder_mappings.append( + OrphaDisorderMapping( + database_id=f"{mapping['Source']}:{mapping['Reference']}", + relation=DisorderMappingRelation( + mapping["DisorderMappingRelation"].split(" ")[0] + ), + ) + ) + summary_information = record["cross_references"].get("SummaryInformation") or [] + disorder = OrphaDisorder( + database_id=record["orpha_id"], + database_name=record["cross_references"]["Preferred term"], + definition=summary_information[0].get("Definition") + if summary_information + else None, + synonyms=record["cross_references"].get("Synonym") or [], + mappings=disorder_mappings, + ) + disorders.append(disorder) + # Extract gene mappings. + for assoc in (record.get("disease_genes") or {}).get("DisorderGeneAssociation") or []: + hgnc_id = None + for ext_ref in assoc["Gene"]["ExternalReference"] or []: + if ext_ref["Source"] == "HGNC": + hgnc_id = f"HGNC:{ext_ref['Reference']}" + if hgnc_id is None: + if assoc["Gene"]["Symbol"] not in xlink_by_gene_symbol: + logger.info(f"Skipping GeneType = {assoc['Gene']['GeneType']}") + continue + else: + hgnc_id = xlink_by_gene_symbol[assoc["Gene"]["Symbol"]].hgnc_id + gene_mappings.append( + OrphaGeneMapping( + orpha_id=record["orpha_id"], + hgnc_id=hgnc_id, + status=OrphaStatus(assoc["DisorderGeneAssociationStatus"]), + association_type=OrphaAssocationType(assoc["DisorderGeneAssociationType"]), + ) + ) + return disorders, gene_mappings + + +@enum.unique +class PanelappConfidence(enum.Enum): + """Confidence level for PanelApp.""" + + #: Green + GREEN = "GREEN" + #: Amber + AMBER = "AMBER" + #: Red + RED = "RED" + #: None + NONE = "NONE" + + def __lt__(self, other: "PanelappConfidence") -> bool: + """Compare confidence levels. + + GREEN is smallest for sorting ergnonomics. + """ + mapping = { + PanelappConfidence.GREEN: 0, + PanelappConfidence.AMBER: 1, + PanelappConfidence.RED: 2, + PanelappConfidence.NONE: 3, + } + return mapping[self] < mapping[other] + + +@enum.unique +class PanelappEntityType(enum.Enum): + """Entity type for PanelApp.""" + + #: Gene + GENE = "GENE" + #: Region + REGION = "REGION" + #: STR + STR = "STR" + + +class PanelappPanel(BaseModel): + """Information about a PanelApp panel.""" + + #: Numeric ID + id: int + #: Name of the panel. + name: str + #: Version of the panel. + version: str + + +class PanelappAssociation(BaseModel): + """A gene-to-disease association from PanelApp.""" + + model_config = ConfigDict(use_enum_values=True) + + #: Gene HGNC ID. + hgnc_id: str + #: Confidence level. + confidence_level: PanelappConfidence + #: Entity type. + entity_type: PanelappEntityType + #: Mode of inheritance. + mode_of_inheritance: Optional[str] + #: The phenotypes as list of strings. + phenotypes: List[str] + #: Information about the panel that is the source. 
+ panel: PanelappPanel + + +def parse_panelapp_jsonl( + path: str, xlink_by_gene_symbol: Dict[str, GeneXlink] +) -> List[PanelappAssociation]: + entity_type_mapping = { + "gene": PanelappEntityType.GENE.value, + "region": PanelappEntityType.REGION.value, + "str": PanelappEntityType.STR.value, + } + confidence_level_mapping = { + "3": PanelappConfidence.GREEN.value, + "2": PanelappConfidence.AMBER.value, + "1": PanelappConfidence.RED.value, + "0": PanelappConfidence.NONE.value, + } + + result = [] + with open(path, "rt") as inputf: + for line in inputf: + record = json.loads(line) + entity_type = PanelappEntityType(entity_type_mapping[record["entity_type"]]) + if entity_type not in [PanelappEntityType.GENE, PanelappEntityType.STR]: + continue + hgnc_id = record["gene_data"].get("hgnc_id") + if not hgnc_id: + hgnc_id = getattr( + xlink_by_gene_symbol.get(record["gene_data"]["gene_symbol"]), "hgnc_id", None + ) + if not hgnc_id: + logger.warning( + f"Skipping record without HGNC ID and unmappable gene symbol: {record['gene_data']['gene_symbol']}" + ) + continue + record["confidence_level"] = confidence_level_mapping[record["confidence_level"]] + assoc = PanelappAssociation( + hgnc_id=hgnc_id, + confidence_level=PanelappConfidence(record["confidence_level"]), + mode_of_inheritance=record["mode_of_inheritance"], + entity_type=entity_type, + phenotypes=record["phenotypes"], + panel=PanelappPanel( + id=record["panel"]["id"], + name=record["panel"]["name"], + version=record["panel"]["version"], + ), + ) + if assoc.confidence_level != PanelappConfidence.NONE: + result.append(assoc) + return result + + +@enum.unique +class MondoDiseaseRelation(enum.Enum): + """Characterisation of disorder mapping relation.""" + + #: Exact. + EXACT = "EXACT" + #: Related. + RELATED = "RELATED" + #: Broad. + BROAD = "BROAD" + #: Narrow. + NARROW = "NARROW" + #: Abbreviation. + ABBREVIATION = "ABBREVIATION" + #: Ambiguous. + AMBIGUOUS = "AMBIGUOUS" + #: Deprecated. + DEPRECATED = "DEPRECATED" + #: Dubious. + DUBIOUS = "DUBIOUS" + #: Exclude. + EXCLUDE = "EXCLUDE" + #: Misspelling. + MISSPELLING = "MISSPELLING" + #: Non-human. + NON_HUMAN = "NON-HUMAN" + #: UK spelling synonym. + OMO_0003005 = "OMO:0003005" + + +class MondoSynonym(BaseModel): + """Mapping into another terminology.""" + + model_config = ConfigDict(use_enum_values=True) + + #: Identifier of source terminology with prefix of the terminology. + database_id: str + #: The relationship between MONDO and the target terminology. + relation: List[MondoDiseaseRelation] + + +class MondoDisease(BaseModel): + """Relevant information from a MONDO disease.""" + + #: The MONDO term ID. + mondo_id: str + #: The name of the disease. + name: str + #: A list of synonyms.
+ synonyms: List[MondoSynonym] + + +def parse_mondo_obo(path: str) -> List[MondoDisease]: + """Parse MONDO OBO file.""" + result = [] + mondo = pronto.Ontology(path) + for term in mondo.terms(): + synonyms = [] + for synonym in term.synonyms: + for xref in synonym.xrefs: + synonyms.append( + MondoSynonym( + database_id=xref.id, + relation=list(map(MondoDiseaseRelation, synonym.scope.split(" "))), + ) + ) + result.append(MondoDisease(mondo_id=term.id, name=term.name, synonyms=synonyms)) + return result + + +class MondoUnmappedOmim(BaseModel): + """Model for the MONDO unmapped OMIM file.""" + + subject_id: str + subject_label: str + + +def parse_mondo_unmapped_omim(path: str) -> List[MondoUnmappedOmim]: + """Parse MONDO unmapped OMIM file.""" + result = [] + with open(path) as inputf: + for line in inputf: + if line.startswith("subject_id"): + continue + record = line.rstrip("\n").split("\t") + result.append(MondoUnmappedOmim(subject_id=record[0], subject_label=record[1])) + return result + + +@enum.unique +class ConfidenceLevel(enum.Enum): + """Confidence level for gene-disease association.""" + + #: High confidence. + #: + #: This corresponds to a match in OMIM, an "Assessed" entry in Orphanet, or a PanelApp entry + #: with GREEN color. + HIGH = "HIGH" + #: Medium confidence. + #: + #: This corresponds to a PanelApp entry with AMBER color. + MEDIUM = "MEDIUM" + #: Low confidence. + #: + #: This corresponds to a "Not yet assessed" entry in Orphanet or a PanelApp entry with RED + #: color. + LOW = "LOW" + + def __lt__(self, other: "ConfidenceLevel") -> bool: + """Compare confidence levels. + + HIGH is smallest for sorting ergonomics. + """ + mapping = { + ConfidenceLevel.HIGH: 3, + ConfidenceLevel.MEDIUM: 2, + ConfidenceLevel.LOW: 1, + } + return mapping[self] < mapping[other] + + +@enum.unique +class GeneDiseaseAssociationSource(enum.Enum): + """Enumeration for gene-disease association source.""" + + #: OMIM. + OMIM = "OMIM" + #: Orphanet. + ORPHANET = "ORPHANET" + #: PanelApp. + PANELAPP = "PANELAPP" + + def __lt__(self, other: "GeneDiseaseAssociationSource") -> bool: + """Compare sources.""" + return self.value < other.value + + +class GeneDiseaseAssociationEntry(BaseModel): + """A source gene-disease association entry.""" + + #: The source. + source: GeneDiseaseAssociationSource + #: The confidence level. + confidence: ConfidenceLevel + + +class LabeledDisorder(BaseModel): + """Disorder identifier with a label.""" + + model_config = ConfigDict(frozen=True) + + #: ID of the disorder. + term_id: str + #: Label of the disorder. + title: Optional[str] + + def __lt__(self, other: "LabeledDisorder") -> bool: + """Compare labeled disorders.""" + return self.term_id < other.term_id + + +class GeneDiseaseAssociation(BaseModel): + """Association of a gene to a disease.""" + + model_config = ConfigDict(frozen=True) + + #: Gene HGNC ID. + hgnc_id: str + #: List of primary disease identifiers that diseases came from. + labeled_disorders: Tuple[LabeledDisorder, ...] + #: List of disease identifiers, used for clustering. + disease_ids: Tuple[str, ...] = Field(..., exclude=True) + #: Disease name for display, if any. + disease_name: Optional[str] + #: Disease definition from Orphanet, if available. + disease_definition: Optional[str] + #: The overall supporting sources. + sources: Tuple[GeneDiseaseAssociationSource, ...] + #: The overall confidence level.
+ confidence: ConfidenceLevel + + def merge(self, other: "GeneDiseaseAssociation") -> "GeneDiseaseAssociation": + if self.hgnc_id != other.hgnc_id: + raise RuntimeError("Cannot merge different genes.") + labeled_disease_ids = tuple( + sorted(set(chain(self.labeled_disorders, other.labeled_disorders))) + ) + disease_ids = tuple(sorted(set(chain(self.disease_ids, other.disease_ids)))) + disease_name = self.disease_name or other.disease_name + disease_definition = self.disease_definition or other.disease_definition + sources = tuple(sorted(set(chain(self.sources, other.sources)))) + if self.confidence < other.confidence: + confidence = other.confidence + else: + confidence = self.confidence + return GeneDiseaseAssociation( + hgnc_id=self.hgnc_id, + labeled_disorders=labeled_disease_ids, + disease_ids=disease_ids, + disease_name=disease_name, + disease_definition=disease_definition, + sources=sources, + confidence=confidence, + ) + + +class CtdDiseaseEntry(BaseModel): + """Entry in CTD disease database.""" + + model_config = ConfigDict(frozen=True) + + disease_name: str + disease_id: str + alt_disease_ids: List[str] + definition: str + parent_ids: List[str] + tree_numbers: List[str] + parent_tree_numbers: List[str] + synonyms: List[str] + slim_mappings: List[str] + + +def parse_ctd_disease_tsv(path: str) -> List[CtdDiseaseEntry]: + """Parse CTD disease database TSV file.""" + result = [] + with gzip.open(path, "rt") as inputf: + for line in inputf: + if line.startswith("#"): + continue + record = line.rstrip("\n").split("\t") + result.append( + CtdDiseaseEntry( + disease_name=record[0], + disease_id=record[1], + alt_disease_ids=record[2].split("|"), + definition=record[3], + parent_ids=record[4].split("|"), + tree_numbers=record[5].split("|"), + parent_tree_numbers=record[6].split("|"), + synonyms=record[7].split("|"), + slim_mappings=record[8].split("|"), + ) + ) + return result + + +class DoLabel(BaseModel): + """Label for a DO term.""" + + #: The OMIM id. + omim_id: str + #: The label. + label: str + + +def parse_do_omim_unmapped(path: str) -> List[DoLabel]: + """Parse DO OMIM unmapped file.""" + result = [] + first = True + with open(path) as inputf: + for line in inputf: + record = line.rstrip("\n").split(",") + if first: + first = False + else: + result.append(DoLabel(omim_id=record[0], label=record[1])) + return result + + +class OmimInDo(BaseModel): + """Entry in OMIMinDO file.""" + + model_config = ConfigDict(frozen=True) + + id: str + label: str + xrefs: List[str] + + +def parse_do_omim_in_do(path: str) -> List[OmimInDo]: + result = [] + with open(path, "rt") as inputf: + reader = csv.DictReader(inputf, delimiter="\t", escapechar="\\") + for row in reader: + row["xrefs"] = [xref for xref in row["xrefs"].split(", ") if xref != "OMIM:genemap2"] + result.append(OmimInDo(**row)) + return result + + +def parse_do_omim_import(path: str) -> List[DoLabel]: + result = [] + omim_import = pronto.Ontology(path) + for term in omim_import.terms(): + if term.name: + result.append(DoLabel(omim_id=term.id, label=term.name)) + return result + + +class GeneConditionAssociation(BaseModel): + """Diseases and PanelApp information linked to a gene.""" + + #: Gene HGNC ID. + hgnc_id: str + #: The gene disease association + disease_associations: Tuple[GeneDiseaseAssociation, ...] + #: PanelApp panels with related genes. + panelapp_associations: Tuple[PanelappAssociation, ...] + + +class ResultContainer(BaseModel): + """Container for results.""" + + #: The results. 
+    results: Tuple[GeneConditionAssociation, ...]
+
+    def __lt__(self, other: "ResultContainer") -> bool:
+        """Compare two result containers."""
+        return self.results < other.results
+
+
+#: Some manual OMIM labels that are on MONDO exclusion lists etc.
+MANUAL_OMIM_LABELS = {
+    "OMIM:300932": "Thyroxine-binding globulin QTL",
+    "OMIM:615018": "Sd(a) polyagglutination syndrome",
+    "OMIM:617956": "Beta-glycopyranoside tasting",
+    "OMIM:617966": "Low density lipoprotein cholesterol level QTL 7",
+    "OMIM:617995": "IMPDH2 enzyme activity, variation in",
+    "OMIM:618079": "Low density lipoprotein cholesterol level QTL 8",
+    "OMIM:618807": "LPA deficiency, congenital",
+    "OMIM:619812": "Blood group, EMM system",
+    "OMIM:620116": "Fatty liver disease, protection from",
+    "OMIM:620207": "ER blood group system",
+}
+
+
+#: MondoDiseaseRelations that are considered to be "good enough" mappings.
+GOOD_MONDO_RELATIONS = (
+    MondoDiseaseRelation.EXACT.value,
+    MondoDiseaseRelation.NARROW.value,
+    MondoDiseaseRelation.BROAD.value,
+    MondoDiseaseRelation.RELATED.value,
+)
+#: DisorderMappingRelation that are considered to be "good enough" mappings.
+GOOD_ORPHA_RELATIONS = (
+    DisorderMappingRelation.EXACT.value,
+    DisorderMappingRelation.BTNT.value,  # ?
+    DisorderMappingRelation.BTNT_E.value,
+    DisorderMappingRelation.NTBT.value,
+    DisorderMappingRelation.NTBT_E.value,  # ?
+)
+
+
+class GeneDiseaseKey(BaseModel):
+    """Key for a gene-disease association."""
+
+    model_config = ConfigDict(frozen=True)
+
+    #: Gene HGNC ID.
+    hgnc_id: str
+    #: Disease database ID.
+    disease_id: str
+
+
+class Integrator:
+    """Implementation of the integration algorithm."""
+
+    # data loaded / pickled
+
+    gene_xlinks: List[GeneXlink]
+    xlink_by_hgnc_id: Dict[str, GeneXlink]
+    xlink_by_gene_symbol: Dict[str, GeneXlink]
+    xlink_by_entrez_id: Dict[str, GeneXlink]
+    xlink_by_omim_id: Dict[str, GeneXlink]
+    mim2gene_medgen: List[Mim2geneMedgen]
+    hpoa_entries: List[HpoaEntry]
+    disease_label_map: Dict[str, DiseaseNameLabel]
+    orpha_disorders: List[OrphaDisorder]
+    orpha_mappings: List[OrphaGeneMapping]
+    panelapp_associations: List[PanelappAssociation]
+    mondo_diseases: List[MondoDisease]
+    mondo_unmapped_omim: List[MondoUnmappedOmim]
+    ctd_diseases: List[CtdDiseaseEntry]
+    do_unmapped_labels: List[DoLabel]
+    omim_in_dos: List[OmimInDo]
+    do_omim_import: List[DoLabel]
+
+    # other data
+
+    #: OMIM titles
+    omim_titles: Dict[str, str]
+    #: Mappings from MONDO to ORPHA
+    mondo_to_orpha: Dict[str, Set[str]]
+    #: Mappings from MONDO to OMIM
+    mondo_to_omim: Dict[str, Set[str]]
+    #: Mappings from ORPHA to OMIM
+    orpha_to_omim: Dict[str, Set[str]]
+    #: Mappings from OMIM to ORPHA
+    omim_to_orpha: Dict[str, Set[str]]
+
+    def __init__(self):
+        """Initialise the integrator."""
+        #: Mapping from `(gene_hgnc_id, disease_id)` to `GeneDiseaseAssociation`.
+        self.disease_assocs: Dict[GeneDiseaseKey, GeneDiseaseAssociation] = {}
+        #: Mapping from `hgnc_id` to list of `PanelappAssociation`s.
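+        #: This map is filled in `handle_panelapp` and written out by `run`.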
+        self.panelapp_assocs: Dict[str, List[PanelappAssociation]] = {}
+
+    def register_disease_assoc(self, assoc: GeneDiseaseAssociation):
+        """Register a gene-disease association."""
+        found_list = set()
+        for disease_id in assoc.disease_ids:
+            key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id)
+            if key in self.disease_assocs:
+                found_list.add(self.disease_assocs[key])
+        if not found_list:
+            for disease_id in assoc.disease_ids:
+                key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id)
+                self.disease_assocs[key] = assoc
+        else:
+            if len(found_list) != 1:
+                logger.warning(f"Found multiple associations for {assoc.hgnc_id}")
+            for found in found_list:
+                found = found.merge(assoc)
+            for disease_id in assoc.disease_ids:
+                key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id)
+                self.disease_assocs[key] = found
+
+    def run(self, pickle_path: Optional[str] = None):
+        """Run the integration and write one JSON record per gene to stdout."""
+        logger.info("Building gene-disease map...")
+        self.load_data(pickle_path)
+        self.handle_orpha()
+        self.handle_mim2gene_medgen()
+        self.handle_panelapp()
+
+        conditions_by_hgnc = {
+            hgnc_id: GeneConditionAssociation(
+                hgnc_id=hgnc_id, disease_associations=[], panelapp_associations=[]
+            )
+            for hgnc_id in sorted(
+                set(
+                    chain(
+                        (k.hgnc_id for k in self.disease_assocs.keys()), self.panelapp_assocs.keys()
+                    )
+                )
+            )
+        }
+        for hgnc_id, assocs in self.panelapp_assocs.items():
+            conditions_by_hgnc[hgnc_id] = conditions_by_hgnc[hgnc_id].model_copy(
+                update={
+                    "panelapp_associations": tuple(sorted(assocs, key=lambda a: a.confidence_level))
+                }
+            )
+        for key, assoc in self.disease_assocs.items():
+            disease_assoc = conditions_by_hgnc[key.hgnc_id].disease_associations
+            conditions_by_hgnc[key.hgnc_id] = conditions_by_hgnc[key.hgnc_id].model_copy(
+                update={
+                    "disease_associations": tuple(
+                        sorted(chain(disease_assoc, [assoc]), key=lambda a: a.confidence)
+                    )
+                }
+            )
+        result = ResultContainer(results=tuple(conditions_by_hgnc.values()))
+        for assoc in result.results:
+            json.dump(obj=assoc.model_dump(mode="json"), fp=sys.stdout)
+            sys.stdout.write("\n")
+        logger.info("All done. Have a nice day.")
+
+    def load_data(self, pickle_path: Optional[str] = None):  # noqa: C901
+        """Load the input files, optionally caching the parsed data at ``pickle_path``."""
+        if pickle_path and os.path.exists(pickle_path):
+            logger.info("unpickling...")
+            with gzip.open(pickle_path, "rb") as inputf:
+                (
+                    self.gene_xlinks,
+                    self.mim2gene_medgen,
+                    self.hpoa_entries,
+                    self.disease_label_map,
+                    self.orpha_disorders,
+                    self.orpha_mappings,
+                    self.panelapp_associations,
+                    self.mondo_diseases,
+                    self.mondo_unmapped_omim,
+                    self.ctd_diseases,
+                    self.do_unmapped_labels,
+                    self.omim_in_dos,
+                    self.do_omim_import,
+                ) = pickle.load(inputf)
+            self.xlink_by_entrez_id = {
+                xlink.entrez_id: xlink for xlink in self.gene_xlinks if xlink.entrez_id
+            }
+            self.xlink_by_gene_symbol = {xlink.gene_symbol: xlink for xlink in self.gene_xlinks}
+        else:
+            logger.info("Parsing genes xlink file...")
+            self.gene_xlinks = parse_gene_xlink(GENES_XLINK_PATH)
+            self.xlink_by_entrez_id = {
+                xlink.entrez_id: xlink for xlink in self.gene_xlinks if xlink.entrez_id
+            }
+            logger.info("Parsing mim2gene_medgen file...")
+            self.mim2gene_medgen = parse_mim2gene_medgen(
+                MIM2GENE_MEDGEN_PATH, self.xlink_by_entrez_id
+            )
+            logger.info("Parsing HPOA file...")
+            self.hpoa_entries = parse_hpoa(HPOA_PATH)
+            logger.info("Converting to disease label map...")
+            self.disease_label_map = hpoa_entries_to_disease_label_map(self.hpoa_entries)
+            # print(list(disease_label_map.items())[:10])
+            logger.info("Parsing ORPHA JSONL file...")
+            self.xlink_by_gene_symbol = {xlink.gene_symbol: xlink for xlink in self.gene_xlinks}
+            self.orpha_disorders, self.orpha_mappings = parse_orpha_jsonl(
+                ORPHA_JSONL_PATH, self.xlink_by_gene_symbol
+            )
+            # print(json.dumps([d.model_dump() for d in disorders[:10]], indent=2))
+            logger.info("Parsing PanelApp JSONL file...")
+            self.panelapp_associations = parse_panelapp_jsonl(
+                PANELAPP_JSONL_PATH, self.xlink_by_gene_symbol
+            )
+
+            logger.info("Loading MONDO files...")
+            self.mondo_diseases = parse_mondo_obo(MONDO_OBO_PATH)
+            self.mondo_unmapped_omim = parse_mondo_unmapped_omim(MONDO_UNMAPPED_OMIM_PATH)
+            logger.info("Loading CTD disease file...")
+            self.ctd_diseases = parse_ctd_disease_tsv(CTD_PATH)
+            logger.info("Loading DO files...")
+            self.do_unmapped_labels = parse_do_omim_unmapped(DO_OMIM_UNMAPPED_PATH)
+            self.omim_in_dos = parse_do_omim_in_do(DO_OMIM_INDO_PATH)
+            self.do_omim_import = parse_do_omim_import(DO_OMIM_IMPORT_PATH)
+
+            if pickle_path:
+                with gzip.open(pickle_path, "wb") as outputf:
+                    logger.info("Pickling...")
+                    pickle.dump(
+                        (
+                            self.gene_xlinks,
+                            self.mim2gene_medgen,
+                            self.hpoa_entries,
+                            self.disease_label_map,
+                            self.orpha_disorders,
+                            self.orpha_mappings,
+                            self.panelapp_associations,
+                            self.mondo_diseases,
+                            self.mondo_unmapped_omim,
+                            self.ctd_diseases,
+                            self.do_unmapped_labels,
+                            self.omim_in_dos,
+                            self.do_omim_import,
+                        ),
+                        outputf,
+                    )
+
+        # Build gene mappings.
+        self.xlink_by_hgnc_id = {xlink.hgnc_id: xlink for xlink in self.gene_xlinks}
+        self.xlink_by_omim_id = {
+            f"OMIM:{omim_id}": xlink for xlink in self.gene_xlinks for omim_id in xlink.omim_ids
+        }
+
+        # Build mapping from OMIM to title.
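+        # The sources below are consulted in priority order: HPOA disease names
+        # first, then the DO-derived label files (unmapped terms, OMIMinDO,
+        # OMIM import), then CTD, then the MONDO "unmapped OMIM" labels, and
+        # finally the manual fallback labels from `MANUAL_OMIM_LABELS`.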
+        self.omim_titles = {}
+        for entry in self.hpoa_entries:
+            if entry.database_id.startswith("OMIM:"):
+                self.omim_titles[entry.database_id] = entry.disease_name
+        for label in self.do_unmapped_labels:
+            self.omim_titles.setdefault(label.omim_id, label.label)
+        for omim_in_do in self.omim_in_dos:
+            for xref in omim_in_do.xrefs:
+                if xref.startswith("OMIM:") and xref not in self.omim_titles:
+                    self.omim_titles[xref] = omim_in_do.label
+        for label in self.do_omim_import:
+            if label.omim_id.startswith("OMIM:") and label.omim_id not in self.omim_titles:
+                self.omim_titles[label.omim_id] = label.label
+        for ctd_disease in self.ctd_diseases:
+            for disease_id in chain([ctd_disease.disease_id], ctd_disease.alt_disease_ids):
+                if disease_id.startswith("OMIM:") and disease_id not in self.omim_titles:
+                    self.omim_titles[disease_id] = ctd_disease.disease_name
+        for unmapped_record in self.mondo_unmapped_omim:
+            self.omim_titles.setdefault(unmapped_record.subject_id, unmapped_record.subject_label)
+        for key, value in MANUAL_OMIM_LABELS.items():
+            self.omim_titles.setdefault(key, value)
+
+        # Build ID mappings from MONDO to OMIM and ORPHA.
+        self.mondo_to_omim = {}
+        self.mondo_to_orpha = {}
+        for relation in GOOD_MONDO_RELATIONS:
+            for mondo_disease in self.mondo_diseases:
+                for synonym in mondo_disease.synonyms:
+                    if (
+                        synonym.database_id.startswith("OMIM:")
+                        and synonym.database_id != "OMIM:genemap2"
+                        and relation in synonym.relation
+                        and synonym.database_id not in self.mondo_to_omim
+                    ):
+                        self.mondo_to_omim.setdefault(mondo_disease.mondo_id, set()).add(
+                            synonym.database_id
+                        )
+                    if (
+                        synonym.database_id.startswith("Orphanet:")
+                        and relation in synonym.relation
+                        and synonym.database_id not in self.mondo_to_orpha
+                    ):
+                        self.mondo_to_orpha.setdefault(mondo_disease.mondo_id, set()).add(
+                            synonym.database_id.replace("Orphanet:", "ORPHA:").replace(
+                                "-definition", ""
+                            )
+                        )
+        # Build ID mappings between ORPHA and OMIM.
+        self.orpha_to_omim = {}
+        self.omim_to_orpha = {}
+        for orpha_disorder in self.orpha_disorders:
+            orpha_id = orpha_disorder.database_id
+            for mapping in orpha_disorder.mappings:
+                if (
+                    mapping.database_id.startswith("OMIM:")
+                    and mapping.relation in GOOD_ORPHA_RELATIONS
+                ):
+                    omim_id = mapping.database_id
+                    self.orpha_to_omim.setdefault(orpha_id, set()).add(omim_id)
+                    self.omim_to_orpha.setdefault(omim_id, set()).add(orpha_id)
+
+    def handle_mim2gene_medgen(self):
+        """Process the mim2gene_medgen data."""
+        logger.info("Processing mim2gene_medgen...")
+        # Our primary source for OMIM names is the HPOA table.
+        hpoa_by_disease_id = {
+            entry.database_id: entry for entry in self.hpoa_entries if entry.database_id
+        }
+        # Our second source for OMIM names is MONDO; the DO-derived labels remain
+        # available later as a fallback via `self.omim_titles`.
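+        # As a hypothetical example, if a MONDO term carries the synonym xref
+        # "OMIM:123456", the lookup built below yields that MONDO record, providing
+        # both a display name and an extra MONDO ID for the disease cluster.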
+        mondo_by_omim_id: Dict[str, List[MondoDisease]] = {}
+        for relation in GOOD_MONDO_RELATIONS:
+            for mondo_disease in self.mondo_diseases:
+                for synonym in mondo_disease.synonyms:
+                    if (
+                        synonym.database_id.startswith("OMIM:")
+                        and relation in synonym.relation
+                        and synonym.database_id not in mondo_by_omim_id
+                    ):
+                        mondo_by_omim_id.setdefault(synonym.database_id, []).append(mondo_disease)
+        # do_by_disease_id: Dict[str, str] = {
+        #     label.omim_id: label.label for label in self.do_unmapped_labels
+        # }
+        # for omim_in_do in self.omim_in_dos:
+        #     for xref in omim_in_do.xrefs:
+        #         if not xref in do_by_disease_id:
+        #             do_by_disease_id[xref] = omim_in_do.label
+        # for label in self.do_omim_import:
+        #     if label.omim_id not in do_by_disease_id:
+        #         do_by_disease_id[label.omim_id] = label.label
+        # Our third source for OMIM names is the CTD disease database.
+        ctd_by_disease_id: Dict[str, CtdDiseaseEntry] = {}
+        for ctd_disease in self.ctd_diseases:
+            for disease_id in ctd_disease.alt_disease_ids:
+                if disease_id not in ctd_by_disease_id:
+                    ctd_by_disease_id[disease_id] = ctd_disease
+        # If annotation with these names fails, Orphanet terms will be tried later.
+        for record in self.mim2gene_medgen:
+            disease_name = None
+            disease_ids = [record.omim_id]
+            if record.omim_id in hpoa_by_disease_id:
+                disease_name = hpoa_by_disease_id[record.omim_id].disease_name
+            elif record.omim_id in mondo_by_omim_id:
+                mondo_records = mondo_by_omim_id[record.omim_id]
+                disease_name = mondo_records[0].name
+                disease_ids += [record.mondo_id for record in mondo_records]
+            elif record.omim_id in ctd_by_disease_id:
+                disease_name = ctd_by_disease_id[record.omim_id].disease_name
+            elif record.omim_id in self.omim_titles:
+                disease_name = self.omim_titles[record.omim_id]
+            self.register_disease_assoc(
+                GeneDiseaseAssociation(
+                    hgnc_id=record.hgnc_id,
+                    labeled_disorders=[LabeledDisorder(term_id=record.omim_id, title=disease_name)],
+                    disease_ids=list(sorted(set(disease_ids))),
+                    disease_name=disease_name,
+                    disease_definition=None,
+                    sources=[GeneDiseaseAssociationSource.OMIM],
+                    confidence=ConfidenceLevel.HIGH,
+                )
+            )
+
+    def handle_orpha(self):
+        """Process the ORPHA data."""
+        logger.info("Processing ORPHA...")
+        orpha_disorders_by_orpha_id = {d.database_id: d for d in self.orpha_disorders}
+
+        # MONDO mapping for proper aliasing.
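+        # (Hypothetical example: a MONDO synonym xref "Orphanet:158" is rewritten to
+        # "ORPHA:158" below so that the MONDO ID can be attached to that ORPHA cluster.)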
+        mondo_by_orpha_id: Dict[str, List[MondoDisease]] = {}
+        for relation in GOOD_MONDO_RELATIONS:
+            for mondo_disease in self.mondo_diseases:
+                for synonym in mondo_disease.synonyms:
+                    if (
+                        synonym.database_id.startswith("Orphanet:")
+                        and relation in synonym.relation
+                        and synonym.database_id not in mondo_by_orpha_id
+                    ):
+                        orpha_id = synonym.database_id.replace("Orphanet:", "ORPHA:")
+                        mondo_by_orpha_id.setdefault(orpha_id, []).append(mondo_disease)
+
+        for orpha_mapping in self.orpha_mappings:
+            database_ids = [orpha_mapping.orpha_id]
+            orpha_disorder = orpha_disorders_by_orpha_id[orpha_mapping.orpha_id]
+            for mapping in orpha_disorder.mappings:
+                if (
+                    mapping.database_id.startswith("OMIM:")
+                    and mapping.relation in GOOD_ORPHA_RELATIONS
+                ):
+                    database_ids.append(mapping.database_id)
+            if orpha_mapping.orpha_id in mondo_by_orpha_id:
+                database_ids += [
+                    record.mondo_id for record in mondo_by_orpha_id[orpha_mapping.orpha_id]
+                ]
+            self.register_disease_assoc(
+                GeneDiseaseAssociation(
+                    hgnc_id=orpha_mapping.hgnc_id,
+                    labeled_disorders=[
+                        LabeledDisorder(
+                            term_id=orpha_mapping.orpha_id, title=orpha_disorder.database_name
+                        )
+                    ],
+                    disease_ids=list(sorted(set(database_ids))),
+                    disease_name=orpha_disorder.database_name,
+                    disease_definition=orpha_disorder.definition,
+                    sources=[GeneDiseaseAssociationSource.ORPHANET],
+                    confidence={
+                        OrphaStatus.ASSESSED.value: ConfidenceLevel.HIGH,
+                        OrphaStatus.NOT_YET_ASSESSED.value: ConfidenceLevel.LOW,
+                    }[orpha_mapping.status],
+                )
+            )
+
+    def handle_panelapp(self):  # noqa: C901
+        """Process the PanelApp data."""
+        RE_OMIM = r"(?:O?MIM:)?(\d\d\d\d\d\d)"
+        RE_MONDO = r"MONDO:(\d+)"
+        RE_ORPHA = r"ORPHA:(\d+)"
+
+        logger.info("Processing PanelApp...")
+        orpha_disorders: Dict[str, OrphaDisorder] = {
+            disease.database_id: disease for disease in self.orpha_disorders
+        }
+        for assoc in self.panelapp_associations:
+            self.panelapp_assocs.setdefault(assoc.hgnc_id, []).append(assoc)
+
+            if assoc.confidence_level == PanelappConfidence.NONE.value:
+                continue
+            if assoc.phenotypes:
+                for phenotype in assoc.phenotypes:
+                    m_omim = re.search(RE_OMIM, phenotype, re.IGNORECASE)
+                    m_mondo = re.search(RE_MONDO, phenotype, re.IGNORECASE)
+                    m_orpha = re.search(RE_ORPHA, phenotype, re.IGNORECASE)
+                    labeled_omim_ids: List[LabeledDisorder] = []
+                    labeled_orpha_ids: List[LabeledDisorder] = []
+                    omim_ids: List[str] = []
+                    orpha_ids: List[str] = []
+                    orpha_disorder: Optional[OrphaDisorder] = None
+                    if m_mondo:
+                        mondo_id = f"MONDO:{m_mondo.group(1)}"
+                        omim_ids = list(self.mondo_to_omim.get(mondo_id, []))
+                        orpha_ids = list(sorted(self.mondo_to_orpha.get(mondo_id, set())))
+                        # We base the disease name and description on the first Orpha ID.
+                        # If the association already exists, this one will be ignored.
+                        if orpha_ids:
+                            orpha_disorder = orpha_disorders.get(orpha_ids[0])
+                    elif m_orpha:
+                        orpha_id = f"ORPHA:{m_orpha.group(1)}"
+                        orpha_ids = [orpha_id]
+                        omim_ids = list(self.orpha_to_omim.get(orpha_id, []))
+                        if orpha_ids:
+                            orpha_disorder = orpha_disorders.get(orpha_id)
+                    elif m_omim:
+                        omim_id = f"OMIM:{m_omim.group(1)}"
+                        if omim_id not in self.xlink_by_omim_id:  # proceed only if not a gene OMIM ID
+                            labeled_omim_ids = omim_ids = [omim_id]
+                            orpha_ids = list(sorted(self.omim_to_orpha.get(omim_id, set())))
+                            # We base the disease name and description on the first Orpha ID.
+                            # If the association already exists, this one will be ignored.
+                            if orpha_ids:
+                                orpha_disorder = orpha_disorders.get(orpha_ids[0])
+                        else:
+                            logger.debug(f"Unresolved phenotype {phenotype} (OMIM is gene)")
+                    else:
+                        logger.debug(f"Unresolved phenotype {phenotype}")
+
+                    if m_omim:
+                        omim_token = f"OMIM:{m_omim.group(1)}"
+                        stripped_phenotype = (
+                            phenotype.replace(f", {omim_token}", "")  # trim ", "
+                            .replace(f" {omim_token}", "")  # trim " "
+                            .replace(omim_token, "")  # trim ""
+                            .replace("{", "")  # trim curly braces in "{xxx}"
+                            .replace("}", "")
+                        )
+                    else:  # no OMIM ID matched; only trim curly braces
+                        stripped_phenotype = phenotype.replace("{", "").replace("}", "")
+
+                    labeled_omim_ids = [
+                        LabeledDisorder(
+                            term_id=omim_id, title=self.omim_titles.get(omim_id, stripped_phenotype)
+                        )
+                        for omim_id in omim_ids
+                    ]
+                    labeled_orpha_ids = [
+                        LabeledDisorder(
+                            term_id=orpha_id, title=orpha_disorders[orpha_id].database_name
+                        )
+                        for orpha_id in orpha_ids
+                    ]
+
+                    if omim_ids or orpha_ids:
+                        # Obtain the disease name from orpha_id/omim_id and lookup.
+                        disease_name = None
+                        for orpha_id in orpha_ids:
+                            if orpha_id in orpha_disorders:
+                                disease_name = orpha_disorders[orpha_id].database_name
+                                break
+                        if not disease_name:
+                            for omim_id in omim_ids:
+                                if omim_id in self.omim_titles:
+                                    disease_name = self.omim_titles[omim_id]
+                                    break
+                        # Use label from PanelApp as fallback.
+                        if not disease_name and m_omim:
+                            disease_name = stripped_phenotype
+                        # We found at least one OMIM/ORPHA ID.  We will now create a new
+                        # `GeneDiseaseAssociation` object from this which will be merged
+                        # into any existing one.
+                        gene_disease_assoc = GeneDiseaseAssociation(
+                            hgnc_id=assoc.hgnc_id,
+                            labeled_disorders=list(
+                                sorted(set(labeled_omim_ids + labeled_orpha_ids))
+                            ),
+                            disease_ids=list(sorted(set(omim_ids + orpha_ids))),
+                            disease_name=disease_name,
+                            disease_definition=orpha_disorder.definition
+                            if orpha_disorder
+                            else None,
+                            sources=[GeneDiseaseAssociationSource.PANELAPP],
+                            confidence={
+                                PanelappConfidence.GREEN.value: ConfidenceLevel.HIGH,
+                                PanelappConfidence.AMBER.value: ConfidenceLevel.MEDIUM,
+                                PanelappConfidence.RED.value: ConfidenceLevel.LOW,
+                            }[assoc.confidence_level],
+                        )
+                        self.register_disease_assoc(gene_disease_assoc)
+            else:
+                logger.debug(f"UNRESOLVED: {assoc.panel.name}")
+
+
+if __name__ == "__main__":
+    Integrator().run("store.pickle.gz" if DEV_MODE else None)
diff --git a/scripts/genes-orpha-diseases.py b/scripts/genes-orpha-diseases.py
index 75da52d..ad9a426 100755
--- a/scripts/genes-orpha-diseases.py
+++ b/scripts/genes-orpha-diseases.py
@@ -1,38 +1,72 @@
 #!/usr/bin/env python
-"""Helper script to extract gene-disease association from orphapacket."""
+"""Helper script to retrieve ORDO gene-disease associations from Orphadata.
+
+When run in CI mode (environment variable "CI" is set to "true"), we copy the
+manually created file from excerpt-data/__orphadata__ rather than querying the
+API. This deviates from the usual approach of putting the URL into `download_urls`.
+"""
 
-import csv
 import json
-import pathlib
+import os
 import sys
+
+import httpx
+import trio
+from loguru import logger
+
+#: URL for listing ORPHAcodes.
+URL_ORPHACODE_LIST = "https://api.orphadata.com/rd-cross-referencing/orphacodes"
+#: URL template for getting information on one ORPHAcode.
+URL_ORPHACODE_GET = "https://api.orphadata.com/rd-cross-referencing/orphacodes/{}?lang=en"
+#: URL template for getting genes on one ORPHAcode.
+URL_ORPHACODE_GET_GENE = "https://api.orphadata.com/rd-associated-genes/orphacodes/{}"
+
+#: Whether we run in CI mode.
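+#: In CI we do not query the live Orphadata API; the checked-in excerpt
+#: `excerpt-data/__orphadata__/orphadata.jsonl` is written to stdout instead
+#: (see the `__main__` block below).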
+RUNS_IN_CI = os.environ.get("CI", "false") == "true"
+
+
+async def main():
+    async with httpx.AsyncClient() as client:
+        logger.info("Fetching ORPHAcode list...")
+        lst = await client.get(URL_ORPHACODE_LIST)
+        logger.info("...done")
+    disease_ids = {disease["ORPHAcode"] for disease in lst.json()["data"]["results"]}
+
+    async def work(no: int, orpha_id: int, limiter: trio.CapacityLimiter):
+        async with limiter:
+            async with httpx.AsyncClient() as client:
+                try:
+                    cross_references = (await client.get(URL_ORPHACODE_GET.format(orpha_id))).json()
+                    disease_genes = (
+                        await client.get(URL_ORPHACODE_GET_GENE.format(orpha_id), timeout=60)
+                    ).json()
+                except Exception as e:
+                    logger.error(f"Error fetching {orpha_id}: {e}")
+                    raise
+                finally:
+                    if no % 100 == 0:
+                        logger.info(f"done fetching ORPHAcode details {no}/{len(disease_ids)}")
+        json.dump(
+            {
+                "orpha_id": f"ORPHA:{orpha_id}",
+                "cross_references": cross_references["data"]["results"],
+                "disease_genes": disease_genes.get("data", {}).get("results"),
+            },
+            sys.stdout,
+        )
+        sys.stdout.write("\n")
-def main():
-    symbol_to_hgnc = {}
-    with open(sys.argv[1], "rt") as inputf:
-        reader = csv.DictReader(inputf, delimiter="\t")
-        for record in reader:
-            symbol_to_hgnc[record["gene_symbol"]] = record["hgnc_id"]
-
-    print(f"# xlink entries: {len(symbol_to_hgnc)}", file=sys.stderr)
-
-    base_path = pathlib.Path(sys.argv[2])
-    print("\t".join(["hgnc_id", "orpha_id", "disease_name"]))
-    for json_path in sorted(base_path.glob("*.json")):
-        with json_path.open("rt") as inputf:
-            data = json.load(inputf)
-        elem_top = data["Orphapacket"]
-        if elem_top.get("DisorderType", {}).get("value") != "Disease":
-            continue  # skip categories
-        disease_name = elem_top["Label"]
-        orpha_id = elem_top["PURL"].replace("http://www.orpha.net/ORDO/Orphanet_", "ORPHA:")
-        elem_genes = elem_top.get("Genes", [])
-        for elem_gene in elem_genes:
-            gene_symbol = elem_gene["Gene"]["Symbol"]
-            hgnc_id = symbol_to_hgnc.get(gene_symbol)
-            if hgnc_id:  # skip if no HGNC ID exists, maybe withdrawn?
-                print("\t".join(map(str, [hgnc_id, orpha_id, disease_name])))
+    logger.info("Fetching ORPHAcode details...")
+    limiter = trio.CapacityLimiter(10)
+    async with trio.open_nursery() as nursery:
+        for no, disease_id in enumerate(disease_ids):
+            nursery.start_soon(work, no, disease_id, limiter)
+    logger.info("...done")
 
 
 if __name__ == "__main__":
-    main()
+    if RUNS_IN_CI:
+        with open("excerpt-data/__orphadata__/orphadata.jsonl", "rt") as inputf:
+            sys.stdout.write(inputf.read())
+    else:
+        trio.run(main)
diff --git a/scripts/genes-xlink-hgnc.jq b/scripts/genes-xlink-hgnc.jq
index 271e579..e6be805 100644
--- a/scripts/genes-xlink-hgnc.jq
+++ b/scripts/genes-xlink-hgnc.jq
@@ -1,7 +1,7 @@
 # Create gene identifier interlink table.
 
 # Create header
-["hgnc_id", "ensembl_gene_id", "entrez_id", "gene_symbol"],
+["hgnc_id", "ensembl_gene_id", "entrez_id", "gene_symbol", "omim_ids"],
 # Then, process the file
 (
     # For each entry in .response.docs
@@ -12,7 +12,8 @@
         .hgnc_id // "",
         .ensembl_gene_id // "",
         .entrez_id // "",
-        .symbol // ""
+        .symbol // "",
+        ((.omim_id // []) | join(","))
     ]
 )
 # Convert everything to TSV
diff --git a/varfish_db_downloader/versions.py b/varfish_db_downloader/versions.py
index 4ec6af1..1176104 100644
--- a/varfish_db_downloader/versions.py
+++ b/varfish_db_downloader/versions.py
@@ -102,8 +102,8 @@ class DataVersions:
     acmg_sf: str
     #: HPO
     hpo: str
-    #: OrphaPacket
-    orphapacket: str
+    #: Orphadata
+    orphadata: str
     #: Pathogenic MMS
     patho_mms: str
     #: Mehari transcript data.
@@ -161,8 +161,8 @@ class DataVersions:
     refseq_38="GCF_000001405.40+RS_2023_03",
     dbsnp="b151",
     acmg_sf="3.1",
-    hpo="20230606",
-    orphapacket="10.1",
+    hpo="20240116",
+    orphadata=TODAY,
     patho_mms="20220730",
     mehari_tx="0.4.4",
     clinvar_release=CLINVAR_RELEASE,