diff --git a/.gitignore b/.gitignore index d306f04..982b6d3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Ignore all pickled data +*.pickle* + # Ignore the workflow directories. /work/ /output/ diff --git a/Snakefile b/Snakefile index 43eacca..9c8a888 100644 --- a/Snakefile +++ b/Snakefile @@ -86,8 +86,9 @@ rule all: # # genes f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz", - f"work/download/genes/orphapacket/{DV.orphapacket}/orphapacket.tar.gz", "work/download/genes/alphamissense/1/AlphaMissense_gene_hg38.tsv.gz", + f"work/download/genes/ctd/{DV.today}/CTD_diseases.tsv.gz", + f"work/download/do/{DV.today}/omim-unmapped.csv", f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz", "work/genes/decipher/v3/decipher_hi_prediction.tsv.gz", f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv", @@ -96,7 +97,8 @@ rule all: f"work/genes/gnomad/{DV.gnomad_constraints}/gnomad_constraints.tsv", f"work/genes/hgnc/{DV.today}/hgnc_info.jsonl", f"work/genes/omim/{DV.hpo}+{DV.today}/omim_diseases.tsv", - f"work/genes/orphapacket/{DV.orphapacket}+{DV.today}/orpha_diseases.tsv", + f"work/genes/orphadata/{DV.orphadata}/orphadata.jsonl", + f"work/genes/mondo/{DV.today}/mondo.obo", "work/genes/rcnv/2022/rcnv_collins_2022.tsv", "work/genes/shet/2019/shet_weghorn_2019.tsv", f"work/genes/clingen/{DV.today}/ClinGen_gene_curation_list_GRCh37.tsv", @@ -177,7 +179,7 @@ rule all: f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY", # ----- genes - f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.orphapacket}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", + f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY", # -- worker data f"output/full/worker/genes-regions-grch37-{DV.refseq_37}+{PV.worker}/refseq_genes.bin", 
f"output/full/worker/genes-regions-grch37-{DV.ensembl_37}+{PV.worker}/ensembl_genes.bin", @@ -341,6 +343,9 @@ include: "rules/work/misc/hpo.smk" include: "rules/work/genes/alphamissense.smk" include: "rules/work/genes/dbnsfp.smk" include: "rules/work/genes/clingen.smk" +include: "rules/work/genes/conditions.smk" +include: "rules/work/genes/ctd.smk" +include: "rules/work/genes/do.smk" include: "rules/work/genes/decipher.smk" include: "rules/work/genes/ensembl.smk" include: "rules/work/genes/gnomad.smk" @@ -350,11 +355,11 @@ include: "rules/work/genes/mehari_data_tx.smk" include: "rules/work/genes/ncbi.smk" include: "rules/work/genes/omim.smk" include: "rules/work/genes/panelapp.smk" -include: "rules/work/genes/orphapacket.smk" +include: "rules/work/genes/mondo.smk" +include: "rules/work/genes/orphadata.smk" include: "rules/work/genes/rcnv.smk" include: "rules/work/genes/shet.smk" include: "rules/work/genes/domino.smk" -include: "rules/work/genes/clingen.smk" # Reference sequence--related rules. include: "rules/work/reference/human.smk" # Features (position and not variant specific). diff --git a/download_urls.yml b/download_urls.yml index a4f0af8..4b3ecea 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -1,3 +1,48 @@ +# Note that this just tests the availability of the OrphaData API. We have a file in +# excerpts/__orphadata__ that is used by `genes-orpha-diseases.py` in CI=true mode. 
+- url: https://api.orphadata.com/rd-cross-referencing/orphacodes + excerpt_strategy: + strategy: no-excerpt + count: null +- url: https://api.orphadata.com/rd-cross-referencing/orphacodes/20?lang=en + excerpt_strategy: + strategy: no-excerpt + count: null +- url: https://api.orphadata.com/rd-associated-genes/orphacodes/20 + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://raw.githubusercontent.com/monarch-initiative/mondo-ingest/main/src/ontology/reports/omim_unmapped_terms.tsv + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/DO_NON_Production_Files/omim_import.obo + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/DOreports/OMIMinDO.tsv + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/reports/omim-unmapped.csv + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: https://ctdbase.org/reports/CTD_diseases.tsv.gz + excerpt_strategy: + strategy: no-excerpt + count: null + +- url: http://purl.obolibrary.org/obo/mondo.obo + excerpt_strategy: + strategy: no-excerpt + count: null + - url: https://panelapp.genomicsengland.co.uk/api/v1/entities/ excerpt_strategy: strategy: no-excerpt @@ -101,7 +146,7 @@ strategy: manual count: null -- url: https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz +- url: https://data.bioontology.org/ontologies/ORDO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv excerpt_strategy: strategy: no-excerpt count: null @@ -130,19 +175,19 @@ strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/hp.obo +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/hp.obo excerpt_strategy: 
strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/phenotype.hpoa +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/phenotype.hpoa excerpt_strategy: strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/phenotype_to_genes.txt +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/phenotype_to_genes.txt excerpt_strategy: strategy: no-excerpt count: null -- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2023-06-06/genes_to_phenotype.txt +- url: https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-01-16/genes_to_phenotype.txt excerpt_strategy: strategy: no-excerpt count: null diff --git a/environment.yml b/environment.yml index 1f42695..5e552bb 100644 --- a/environment.yml +++ b/environment.yml @@ -12,6 +12,8 @@ dependencies: - click - loguru - numpy + - pydantic + - pronto >=2.5,<3.0 - pyyaml - requests - requests-ftp @@ -41,9 +43,13 @@ dependencies: # Parallel (de)compression. 
- pigz # Varfish related - - annonars =0.33.0 + - annonars =0.34.0 - viguno =0.2.0 - mehari =0.21.1 - varfish-server-worker =0.10.2 # S3 uploads - s5cmd =2.1.0 + # async HTTP requests + - httpx =0.25.0 + - httpcore =0.18.0 + - trio diff --git a/excerpt-data/111d8c6e08038f62/20 b/excerpt-data/111d8c6e08038f62/20 new file mode 100644 index 0000000..862d894 --- /dev/null +++ b/excerpt-data/111d8c6e08038f62/20 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b610d631a11a2f6f4b93b8f7f7458447fa1a974ade2a18b1b9c9b5047dac516c +size 3338 diff --git a/excerpt-data/111d8c6e08038f62/url.txt b/excerpt-data/111d8c6e08038f62/url.txt new file mode 100644 index 0000000..2a496ff --- /dev/null +++ b/excerpt-data/111d8c6e08038f62/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bda92b632aaeacd032c57d87e6420ccac3945bcd78bf51bafc117da4eb63a601 +size 69 diff --git a/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv b/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv new file mode 100644 index 0000000..94d4f19 --- /dev/null +++ b/excerpt-data/1963f3c58ea066be/omim_unmapped_terms.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384eccb1d84e8a5036027cd340d2b4b18ce41170de28407f3022d7f3dd2392dc +size 4779 diff --git a/excerpt-data/1963f3c58ea066be/url.txt b/excerpt-data/1963f3c58ea066be/url.txt new file mode 100644 index 0000000..af13e07 --- /dev/null +++ b/excerpt-data/1963f3c58ea066be/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2153c0f0281a0399c77b0964d96c1286f6716eac34304f68f4e838fe41a02bdd +size 116 diff --git a/excerpt-data/32c97f6adaf88f01/hp.obo b/excerpt-data/32c97f6adaf88f01/hp.obo new file mode 100644 index 0000000..3e8f702 --- /dev/null +++ b/excerpt-data/32c97f6adaf88f01/hp.obo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508bb0326603671327e9f4af05beffd1800e053557c7c1a0d137fd1f458bb0f7 +size 9719364 diff --git 
a/excerpt-data/32c97f6adaf88f01/url.txt b/excerpt-data/32c97f6adaf88f01/url.txt new file mode 100644 index 0000000..0f60dc3 --- /dev/null +++ b/excerpt-data/32c97f6adaf88f01/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b057fd342e531569cd4c2af122986569a44bc55348bcbc8104866b9ea696acfb +size 94 diff --git a/excerpt-data/366878c8178827e3/phenotype_to_genes.txt b/excerpt-data/366878c8178827e3/phenotype_to_genes.txt deleted file mode 100644 index 4308961..0000000 --- a/excerpt-data/366878c8178827e3/phenotype_to_genes.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6d9749cfe77403a5133dd86ef2419eae75ceb86b4ac7df1af2872e635d4e77a -size 58253258 diff --git a/excerpt-data/366878c8178827e3/url.txt b/excerpt-data/366878c8178827e3/url.txt deleted file mode 100644 index 6e90764..0000000 --- a/excerpt-data/366878c8178827e3/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9894daafe63bf29b0462ecfe2937aefb11387dd3fbf2070ed1e53b0e37cc4fb9 -size 110 diff --git a/excerpt-data/41961c7350780224/phenotype.hpoa b/excerpt-data/41961c7350780224/phenotype.hpoa new file mode 100644 index 0000000..ed13380 --- /dev/null +++ b/excerpt-data/41961c7350780224/phenotype.hpoa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f10bb697f53d1767335f8f7390dc85ec659d78b946abd1a31decd2058b8ffe9 +size 31795534 diff --git a/excerpt-data/41961c7350780224/url.txt b/excerpt-data/41961c7350780224/url.txt new file mode 100644 index 0000000..2a81f51 --- /dev/null +++ b/excerpt-data/41961c7350780224/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40157925175644167a768330abaa318c6cc3e961d0463e8a3ee45b77513d4dd4 +size 102 diff --git a/excerpt-data/5dfdf97f46a7c299/hp.obo b/excerpt-data/5dfdf97f46a7c299/hp.obo deleted file mode 100644 index d9da674..0000000 --- a/excerpt-data/5dfdf97f46a7c299/hp.obo +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:6fa2f31ade75580469a996be3f2ebbcd3812b6ae75b3b02820a201bb1c4736c9 -size 9222726 diff --git a/excerpt-data/5dfdf97f46a7c299/url.txt b/excerpt-data/5dfdf97f46a7c299/url.txt deleted file mode 100644 index 17e6803..0000000 --- a/excerpt-data/5dfdf97f46a7c299/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2da8d523a44e41522c6bd46943f8cd36d4e73c22388846174372a412cf59f495 -size 94 diff --git a/excerpt-data/6045c008f1c0f370/url.txt b/excerpt-data/6045c008f1c0f370/url.txt deleted file mode 100644 index 17796bd..0000000 --- a/excerpt-data/6045c008f1c0f370/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58b5f890e962de336ac264c788ae9136ad1eb69f52c1f536379c2984b85beef1 -size 71 diff --git a/excerpt-data/6045c008f1c0f370/v10.1.tar.gz b/excerpt-data/6045c008f1c0f370/v10.1.tar.gz deleted file mode 100644 index 4779579..0000000 --- a/excerpt-data/6045c008f1c0f370/v10.1.tar.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:38de2709b2f9a9c4e0b9ec18c5ebae337a47e97897808069dcee7ba5c39d3224 -size 1503498 diff --git a/excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv b/excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv new file mode 100644 index 0000000..08eb9cd --- /dev/null +++ b/excerpt-data/615312ce3f5fc1bf/OMIMinDO.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82d6abe489be419dd0bebcd8e3da518e8fd6b6bf4cfd1a2670df8bdc7a81d71 +size 355299 diff --git a/excerpt-data/615312ce3f5fc1bf/url.txt b/excerpt-data/615312ce3f5fc1bf/url.txt new file mode 100644 index 0000000..b3b97a9 --- /dev/null +++ b/excerpt-data/615312ce3f5fc1bf/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5bc9bbc38d8ec6e3514ee71e383ce3a355dcf77b49682cf1d46bffe86bd15f +size 99 diff --git a/excerpt-data/617bebe58c82f24e/CTD_diseases.tsv.gz b/excerpt-data/617bebe58c82f24e/CTD_diseases.tsv.gz new file 
mode 100644 index 0000000..d879265 --- /dev/null +++ b/excerpt-data/617bebe58c82f24e/CTD_diseases.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e37e0f2cae2f4e6ca6f26ab47508c0ed461b44870fbb7b69cecaf0a9e5c5db1d +size 1778371 diff --git a/excerpt-data/617bebe58c82f24e/url.txt b/excerpt-data/617bebe58c82f24e/url.txt new file mode 100644 index 0000000..9c4a699 --- /dev/null +++ b/excerpt-data/617bebe58c82f24e/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d4d842332879379d1bfffb8f9238d62a95a0b59d8ae0063195cc62c62ad940 +size 48 diff --git a/excerpt-data/652646c24140df2a/mondo.obo b/excerpt-data/652646c24140df2a/mondo.obo new file mode 100644 index 0000000..9c120a6 --- /dev/null +++ b/excerpt-data/652646c24140df2a/mondo.obo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce8ebbfdb594fe32335e6c26c6f10d0c085da5f47e74bf4fbef2a15690b6fe8 +size 33100807 diff --git a/excerpt-data/652646c24140df2a/url.txt b/excerpt-data/652646c24140df2a/url.txt new file mode 100644 index 0000000..82d4fa2 --- /dev/null +++ b/excerpt-data/652646c24140df2a/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a0983bc3cc220a3d3709f27ecd4524035666eb95fe984d61276b0dcb275f882 +size 41 diff --git a/excerpt-data/6f378db589a4bbb9/orphacodes b/excerpt-data/6f378db589a4bbb9/orphacodes new file mode 100644 index 0000000..f3dac85 --- /dev/null +++ b/excerpt-data/6f378db589a4bbb9/orphacodes @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:848ee5fb05bf6c5431aff29eab7ad1bacbc5cbc984cdecba4f5672a30a9ac1e7 +size 1242966 diff --git a/excerpt-data/6f378db589a4bbb9/url.txt b/excerpt-data/6f378db589a4bbb9/url.txt new file mode 100644 index 0000000..228bcdc --- /dev/null +++ b/excerpt-data/6f378db589a4bbb9/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd4412fd56abffcedb9d340f8ac92cdee271bcc5c3e535418384c286675ff0b4 +size 58 diff --git 
a/excerpt-data/91f964d1aa8367a5/20 b/excerpt-data/91f964d1aa8367a5/20 new file mode 100644 index 0000000..fc06abb --- /dev/null +++ b/excerpt-data/91f964d1aa8367a5/20 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec854b1c8f931b18f156c340a1a8ec46509f9207f81c2a04b0b3a519d9b84a87 +size 2191 diff --git a/excerpt-data/91f964d1aa8367a5/url.txt b/excerpt-data/91f964d1aa8367a5/url.txt new file mode 100644 index 0000000..8cfa63d --- /dev/null +++ b/excerpt-data/91f964d1aa8367a5/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5530926f47d8db160ef199d1836317edc614b80fb5138893cfc93ce349f19f90 +size 60 diff --git a/excerpt-data/97187ff23d7d2773/phenotype_to_genes.txt b/excerpt-data/97187ff23d7d2773/phenotype_to_genes.txt new file mode 100644 index 0000000..90d9d53 --- /dev/null +++ b/excerpt-data/97187ff23d7d2773/phenotype_to_genes.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38caf0ffede297f84dc2d41a238828683957f5baa72a6cc226661865d81c562d +size 60158078 diff --git a/excerpt-data/97187ff23d7d2773/url.txt b/excerpt-data/97187ff23d7d2773/url.txt new file mode 100644 index 0000000..3e40490 --- /dev/null +++ b/excerpt-data/97187ff23d7d2773/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed40c33b60c6ba43a4c15d7184aab42bce54bd1d88f29a4b8fa754add577d4c +size 110 diff --git a/excerpt-data/__orphadata__/orphadata.jsonl b/excerpt-data/__orphadata__/orphadata.jsonl new file mode 100644 index 0000000..eebd9c2 --- /dev/null +++ b/excerpt-data/__orphadata__/orphadata.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c7f9a3df098e467957e4dcbcc8bf78b5713b11e9e3060ea0f67437dca0cb71 +size 25839170 diff --git a/excerpt-data/a0f9f11118d32143/download b/excerpt-data/a0f9f11118d32143/download new file mode 100644 index 0000000..759232c --- /dev/null +++ b/excerpt-data/a0f9f11118d32143/download @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:6d4f7b8049f2238d04b0453d2422d7499fd7196162027b7a3b4f2f91ca9b16a2 +size 2509937 diff --git a/excerpt-data/a0f9f11118d32143/url.txt b/excerpt-data/a0f9f11118d32143/url.txt new file mode 100644 index 0000000..b8b04dd --- /dev/null +++ b/excerpt-data/a0f9f11118d32143/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa28fe18f2378e43812405874df6a303db980507aea705278cbc047d03b3d38c +size 118 diff --git a/excerpt-data/ac92d419f271d95f/genes_to_phenotype.txt b/excerpt-data/ac92d419f271d95f/genes_to_phenotype.txt new file mode 100644 index 0000000..797ff46 --- /dev/null +++ b/excerpt-data/ac92d419f271d95f/genes_to_phenotype.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32f95376f4e987fa28397948d492203a5a34950ef22e6c798ac79d1f0919addd +size 18998203 diff --git a/excerpt-data/ac92d419f271d95f/url.txt b/excerpt-data/ac92d419f271d95f/url.txt new file mode 100644 index 0000000..882caf4 --- /dev/null +++ b/excerpt-data/ac92d419f271d95f/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e534bb315ac040576f817d5badb3f20b9ccbd43b8d72ff4bb512747efa77e2 +size 110 diff --git a/excerpt-data/afba4597f963e400/phenotype.hpoa b/excerpt-data/afba4597f963e400/phenotype.hpoa deleted file mode 100644 index 56d35a6..0000000 --- a/excerpt-data/afba4597f963e400/phenotype.hpoa +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27483a513d23276d21799803390b6aebc3194242a9996e7225262133a8f2ff7c -size 31413071 diff --git a/excerpt-data/afba4597f963e400/url.txt b/excerpt-data/afba4597f963e400/url.txt deleted file mode 100644 index 6e8bd6d..0000000 --- a/excerpt-data/afba4597f963e400/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd376b656093f1bd4778a32bae021cd22ec6e282b8737a143964cc4c26d2fd84 -size 102 diff --git a/excerpt-data/e3f85776a4d3a44b/genes_to_phenotype.txt b/excerpt-data/e3f85776a4d3a44b/genes_to_phenotype.txt 
deleted file mode 100644 index ea9f224..0000000 --- a/excerpt-data/e3f85776a4d3a44b/genes_to_phenotype.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47f2f26a9767921165beb34f11d00bf6bc4fb1815613d4433403df68f2b6c52c -size 18182540 diff --git a/excerpt-data/e3f85776a4d3a44b/url.txt b/excerpt-data/e3f85776a4d3a44b/url.txt deleted file mode 100644 index 6223501..0000000 --- a/excerpt-data/e3f85776a4d3a44b/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17384bdd61a8c02121400083229f431a6ab2f8bcceb1b0cebbf1a2e802a7933d -size 110 diff --git a/excerpt-data/e4097d7046a53998/omim_import.obo b/excerpt-data/e4097d7046a53998/omim_import.obo new file mode 100644 index 0000000..9c8c55a --- /dev/null +++ b/excerpt-data/e4097d7046a53998/omim_import.obo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aacf57a9ef7d737f08915c0160b2174453eaa71061ee47753febb353daee12e +size 9631131 diff --git a/excerpt-data/e4097d7046a53998/url.txt b/excerpt-data/e4097d7046a53998/url.txt new file mode 100644 index 0000000..c8e55db --- /dev/null +++ b/excerpt-data/e4097d7046a53998/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be34d29606ea9f6ad67d5d7c777d963b46c024a85ab2709d4f66c46b326ea704 +size 120 diff --git a/excerpt-data/ede624c2cd588939/omim-unmapped.csv b/excerpt-data/ede624c2cd588939/omim-unmapped.csv new file mode 100644 index 0000000..5f421b3 --- /dev/null +++ b/excerpt-data/ede624c2cd588939/omim-unmapped.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36bed4998159bf50c84fa2a39f79210723c63953586fb5ab01f11327c03db741 +size 278605 diff --git a/excerpt-data/ede624c2cd588939/url.txt b/excerpt-data/ede624c2cd588939/url.txt new file mode 100644 index 0000000..2740d94 --- /dev/null +++ b/excerpt-data/ede624c2cd588939/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bfd7010cd3fca806cade1ebb0ed577a723f197b48b486bf631c0e71dc2f3f523 +size 106 diff --git a/rules/output/annonars/genes.smk b/rules/output/annonars/genes.smk index b461a34..4471bc5 100644 --- a/rules/output/annonars/genes.smk +++ b/rules/output/annonars/genes.smk @@ -11,8 +11,9 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl", ncbi="work/genes/entrez/{date}/gene_info.jsonl", omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv", - orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv", + orpha="work/genes/orphadata/{date}/orpha_diseases.tsv", panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl", + conditions="work/genes/conditions/{v_hpo}+{date}/conditions.jsonl", rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv", shet="work/genes/shet/2019/shet_weghorn_2019.tsv", gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz", @@ -20,11 +21,11 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file decipher_hi="work/genes/decipher/v3/decipher_hi_prediction.tsv.gz", output: rocksdb_identity=( - "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/" + "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/" "rocksdb/IDENTITY" ), spec_yaml=( - "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{v_orpha}+{date}+{v_annonars}/" + "output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/" "spec.yaml" ), wildcard_constraints: @@ -35,11 +36,17 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file v_annonars=RE_VERSION, shell: r""" + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + annonars gene import \ --path-out-rocksdb $(dirname {output.rocksdb_identity}) \ --path-in-acmg 
{input.acmg_sf} \ --path-in-clingen-37 {input.clingen_37} \ --path-in-clingen-38 {input.clingen_38} \ + --path-in-conditions {input.conditions} \ --path-in-gnomad-constraints {input.gnomad_constraints} \ --path-in-dbnsfp {input.dbnsfp} \ --path-in-hgnc {input.hgnc} \ @@ -64,7 +71,6 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file --value date={wildcards.date} \ \ --value v_annonars={wildcards.v_annonars} \ - --value v_downloader={PV.downloader} \ - --value v_orphapacket={wildcards.v_orpha} \ + --value v_downloader={PV.downloader} > {output.spec_yaml} """ diff --git a/rules/output/annonars/genes.spec.yaml b/rules/output/annonars/genes.spec.yaml index e4cd860..21b050e 100644 --- a/rules/output/annonars/genes.spec.yaml +++ b/rules/output/annonars/genes.spec.yaml @@ -38,7 +38,7 @@ x-created-from: - name: OMIM version: {{ today }} - name: ORDO - version: {{ v_orphapacket }} + version: {{ v_ordo }} - name: rCNV pHaplo/pTriplo scores version: 2022-Collins-et-al - name: sHet scores diff --git a/rules/output/worker/hgnc.smk b/rules/output/worker/hgnc.smk index 9ff2ddc..1babd54 100644 --- a/rules/output/worker/hgnc.smk +++ b/rules/output/worker/hgnc.smk @@ -9,6 +9,11 @@ rule output_hgnc_xlink_binary: spec_yaml=f"output/full/worker/genes-xlink-{{date}}+{PV.worker}/genes-xlink.spec.yaml", shell: r""" + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + varfish-server-worker db to-bin \ --input-type xlink \ --path-input {input.tsv} \ diff --git a/rules/work/genes/conditions.smk b/rules/work/genes/conditions.smk new file mode 100644 index 0000000..2f65175 --- /dev/null +++ b/rules/work/genes/conditions.smk @@ -0,0 +1,39 @@ +## Rules for aggregating per-gene conditions data for annonars. 
+ + +rule work_conditions_integrate: # integrate conditions file + input: + mim2gene_medgen="work/download/genes/ncbi/{date}/mim2gene_medgen", + genes_xlink="output/full/mehari/genes-xlink-{date}/genes-xlink.tsv", + hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa", + orpha_jsonl="work/genes/orphadata/{date}/orphadata.jsonl", + panelapp_jsonl="work/download/genes/panelapp/{date}/panelapp.jsonl", + mondo_obo="work/genes/mondo/{date}/mondo.obo", + mondo_unmapped_tsv="work/genes/mondo/{date}/omim_unmapped_terms.tsv", + ctd_tsv="work/download/genes/ctd/{date}/CTD_diseases.tsv.gz", + do_omim_unmapped="work/download/do/{date}/omim-unmapped.csv", + do_omim_indo="work/download/do/{date}/OMIMinDO.tsv", + do_omim_import="work/download/do/{date}/omim_import.obo", + output: + jsonl="work/genes/conditions/{v_hpo}+{date}/conditions.jsonl", + shell: + r""" + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + + MIM2GENE_MEDGEN_PATH="{input.mim2gene_medgen}" \ + GENES_XLINK_PATH="{input.genes_xlink}" \ + HPOA_PATH="{input.hpoa}" \ + ORPHA_JSONL_PATH="{input.orpha_jsonl}" \ + PANELAPP_JSONL_PATH="{input.panelapp_jsonl}" \ + MONDO_OBO_PATH="{input.mondo_obo}" \ + MONDO_UNMAPPED_OMIM_PATH="{input.mondo_unmapped_tsv}" \ + CTD_PATH="{input.ctd_tsv}" \ + DO_OMIM_UNMAPPED_PATH="{input.do_omim_unmapped}" \ + DO_OMIM_INDO_PATH="{input.do_omim_indo}" \ + DO_OMIM_IMPORT_PATH="{input.do_omim_import}" \ + python scripts/genes-integrate-diseases.py \ + > {output.jsonl} + """ diff --git a/rules/work/genes/ctd.smk b/rules/work/genes/ctd.smk new file mode 100644 index 0000000..688a250 --- /dev/null +++ b/rules/work/genes/ctd.smk @@ -0,0 +1,15 @@ +## Rules related to CTD download + + +rule genes_ctd_download: # -- download CTD + output: + tsv="work/download/genes/ctd/{date}/CTD_diseases.tsv.gz", + shell: + """ + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + 
>&2 echo "{wildcards.date} is not today" + exit 1 + fi + + wget -O {output.tsv} https://ctdbase.org/reports/CTD_diseases.tsv.gz + """ diff --git a/rules/work/genes/decipher.smk b/rules/work/genes/decipher.smk index 56b51a7..5537679 100644 --- a/rules/work/genes/decipher.smk +++ b/rules/work/genes/decipher.smk @@ -38,19 +38,20 @@ rule genes_decipher_hi_convert: # -- convert DECIPHER HI predictions to TSV | sed -e 's/%$//g' \ >> $TMPDIR/tmp.tsv - qsv join \ - gene_symbol {input.hgnc} \ + qsv select -d $'\t' '!omim_ids' {input.hgnc} \ + | tr ',' '\t' \ + > $TMPDIR/tmp3.tsv + + qsv join -d $'\t' \ + gene_symbol $TMPDIR/tmp3.tsv \ gene_symbol $TMPDIR/tmp.tsv \ - > $TMPDIR/tmp2.tsv - - ( \ - echo -e "hgnc_id\thgnc_symbol\tp_hi\thi_index"; \ - tail -n +2 $TMPDIR/tmp2.tsv \ - | tr ',' '\t' \ - | cut -f 1,5-7 \ - | LC_ALL=C sort \ - ) \ - | gzip -c \ + > $TMPDIR/tmp2.csv + + qsv select \ + 'hgnc_id,gene_symbol,p_hi,hi_index' \ + $TMPDIR/tmp2.csv \ + | qsv rename 'hgnc_id,hgnc_symbol,p_hi,hi_index' \ + | tr ',' '\t' \ > {output.tsv} md5sum {output.tsv} > {output.tsv_md5} diff --git a/rules/work/genes/do.smk b/rules/work/genes/do.smk new file mode 100644 index 0000000..a9f1627 --- /dev/null +++ b/rules/work/genes/do.smk @@ -0,0 +1,24 @@ +## Rules related to DO download + + +rule work_disease_ontology_download: # -- download DO files + output: + unmapped_csv="work/download/do/{date}/omim-unmapped.csv", + omimindo_tsv="work/download/do/{date}/OMIMinDO.tsv", + omim_import="work/download/do/{date}/omim_import.obo", + shell: + r""" + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + + wget -O {output.unmapped_csv} \ + https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/reports/omim-unmapped.csv + wget -O {output.omimindo_tsv} \ + https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/DOreports/OMIMinDO.tsv + wget -O 
{output.omim_import} \ + https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/main/src/deprecated/DO_NON_Production_Files/omim_import.obo + perl -p -i -e 's/(is_a:[^!]*?),.*/\1/g' {output.omim_import} + perl -p -i -e 's/OMIM:PS/OMIM:/g' {output.omim_import} + """ diff --git a/rules/work/genes/mondo.smk b/rules/work/genes/mondo.smk new file mode 100644 index 0000000..9d67396 --- /dev/null +++ b/rules/work/genes/mondo.smk @@ -0,0 +1,14 @@ +## Rules related to MONDO data + + +rule genes_mondo_download: # -- download orphadatas + output: + obo="work/genes/mondo/{version}/mondo.obo", + omim_unmapped="work/genes/mondo/{version}/omim_unmapped_terms.tsv", + shell: + """ + wget -O {output.obo} \ + http://purl.obolibrary.org/obo/mondo.obo + wget -O {output.omim_unmapped} \ + https://raw.githubusercontent.com/monarch-initiative/mondo-ingest/main/src/ontology/reports/omim_unmapped_terms.tsv + """ diff --git a/rules/work/genes/omim.smk b/rules/work/genes/omim.smk index d33136e..bb726ee 100644 --- a/rules/work/genes/omim.smk +++ b/rules/work/genes/omim.smk @@ -12,8 +12,13 @@ rule genes_omim: # -- prepare HGNC to OMIM disease mapping """ set -x + if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + export TMPDIR=$(mktemp -d) - # trap "rm -rf $TMPDIR" ERR EXIT + trap "rm -rf $TMPDIR" ERR EXIT head -n 1 {input.mim2gene} | sed -e 's/ /_/g' -e 's/#//g' \ > $TMPDIR/mim2gene.tsv diff --git a/rules/work/genes/orphadata.smk b/rules/work/genes/orphadata.smk new file mode 100644 index 0000000..0414e91 --- /dev/null +++ b/rules/work/genes/orphadata.smk @@ -0,0 +1,23 @@ +## Rules related to Orphadata download + + +rule genes_orphadata_download: # -- download orphadatas + output: + jsonl="work/genes/orphadata/{date}/orphadata.jsonl", + jsonl_md5="work/genes/orphadata/{date}/orphadata.jsonl.md5", + tsv="work/genes/orphadata/{date}/orpha_diseases.tsv", # xxx + shell: + """ + if [[ "$(date 
+%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then + >&2 echo "{wildcards.date} is not today" + exit 1 + fi + + python ./scripts/genes-orpha-diseases.py \ + > {output.jsonl} + + md5sum {output.jsonl} > {output.jsonl}.md5 + + echo -e "hgnc_id\torpha_id\tassoc_status\tomim_ids\tdisease_name\tdefinition" \ + > {output.tsv} + """ diff --git a/rules/work/genes/orphapacket.smk b/rules/work/genes/orphapacket.smk deleted file mode 100644 index b5201a5..0000000 --- a/rules/work/genes/orphapacket.smk +++ /dev/null @@ -1,34 +0,0 @@ -## Rules related to annotating genes with ORDO terms - - -rule genes_orphapacket_download: # -- download orphapacket file - output: - tar="work/download/genes/orphapacket/{version}/orphapacket.tar.gz", - shell: - r""" - wget -O {output.tar} \ - https://github.com/Orphanet/orphapacket/archive/refs/tags/v10.1.tar.gz - """ - - -rule genes_orphapacket_diseases: # -- postprocess file for HGNC gene IDs - input: - tar="work/download/genes/orphapacket/{version}/orphapacket.tar.gz", - xlink="output/full/mehari/genes-xlink-{date}/genes-xlink.tsv", - output: - tsv="work/genes/orphapacket/{version}+{date}/orpha_diseases.tsv", - tsv_md5="work/genes/orphapacket/{version}+{date}/orpha_diseases.tsv.md5", - shell: - """ - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" ERR EXIT - - tar -C $TMPDIR -xf $(readlink -f {input.tar}) - - python ./scripts/genes-orpha-diseases.py {input.xlink} $TMPDIR/orphapacket-*/json \ - | qsv sort -d '\t' \ - | qsv fmt -t '\t' \ - > {output.tsv} - - md5sum {output.tsv} > {output.tsv}.md5 - """ diff --git a/scripts/genes-integrate-diseases.py b/scripts/genes-integrate-diseases.py new file mode 100644 index 0000000..7e975ef --- /dev/null +++ b/scripts/genes-integrate-diseases.py @@ -0,0 +1,1321 @@ +"""Create integrated gene to disease map.""" + +import csv +import enum +import gzip +import json +import os +import pickle +import re +import sys +from itertools import chain +from typing import Dict, List, 
Optional, Set, Tuple + +import pronto +from loguru import logger +from pydantic import BaseModel, ConfigDict, Field + +#: Path to HGNC:ID to OMIM ID mapping file. +MIM2GENE_MEDGEN_PATH = os.environ["MIM2GENE_MEDGEN_PATH"] +#: Path to genes xlink file. +GENES_XLINK_PATH = os.environ["GENES_XLINK_PATH"] +#: Path to HPOA file. +HPOA_PATH = os.environ["HPOA_PATH"] +#: Path to the ORPHA JSONL file. +ORPHA_JSONL_PATH = os.environ["ORPHA_JSONL_PATH"] +#: Path to the PanelApp JSONL file. +PANELAPP_JSONL_PATH = os.environ["PANELAPP_JSONL_PATH"] +#: Path to the MONDO OBO file. +MONDO_OBO_PATH = os.environ["MONDO_OBO_PATH"] +#: Path to the MONDO unmapped OMIM file. +MONDO_UNMAPPED_OMIM_PATH = os.environ["MONDO_UNMAPPED_OMIM_PATH"] +#: Path to CTD file. +CTD_PATH = os.environ["CTD_PATH"] +#: DO OMIM "ummapped" file. +DO_OMIM_UNMAPPED_PATH = os.environ["DO_OMIM_UNMAPPED_PATH"] +#: DO OMIMinDO file. +DO_OMIM_INDO_PATH = os.environ["DO_OMIM_INDO_PATH"] +#: DO legacy OMIM import file. +DO_OMIM_IMPORT_PATH = os.environ["DO_OMIM_IMPORT_PATH"] + +#: Development mode => pickling of data +DEV_MODE = os.environ.get("DEV_MODE", "0") == "1" + + +class GeneXlink(BaseModel): + """Cross-reference of gene.""" + + #: HGNC ID. + hgnc_id: str + #: ENSEMBL gene ID. + ensembl_gene_id: Optional[str] + #: Entrez gene ID. + entrez_id: Optional[str] + #: Gene symbol + gene_symbol: str + #: OMIM IDs + omim_ids: List[str] + + +def parse_gene_xlink(path: str) -> List[GeneXlink]: + """Parse gene cross-reference file.""" + rows = [] + with open(path) as f: + reader = csv.DictReader(f, delimiter="\t") + for row in reader: + for key in ("ensembl_gene_id", "entrez_id"): + if row[key] == "": + row[key] = None + if not row["omim_ids"]: + row["omim_ids"] = [] + else: + row["omim_ids"] = row["omim_ids"].split(",") + rows.append(GeneXlink(**row)) + return rows + + +class Mim2geneMedgen(BaseModel): + """Mapping from OMIM ID to gene.""" + + model_config = ConfigDict(frozen=True) + + #: OMIM ID. 
+ omim_id: str + #: Gene HGNC ID. + hgnc_id: str + + def __lt__(self, other: "Mim2geneMedgen") -> bool: + """Compare mim2gene_medgen entries.""" + return (self.hgnc_id, self.omim_id) < (other.hgnc_id, other.omim_id) + + +def parse_mim2gene_medgen( + path: str, xlink_by_entrez_id: Dict[str, GeneXlink] +) -> List[Mim2geneMedgen]: + """Parse mim2gene_medgen file.""" + result = [] + with open(path, "rt") as inputf: + reader = csv.DictReader(inputf, delimiter="\t") + for row in reader: + if row["GeneID"] == "-" or row["type"] != "phenotype": + continue + if row["GeneID"] not in xlink_by_entrez_id: + logger.warning(f"Skipping NCBI Gene ID = {row['GeneID']}") + continue + result.append( + Mim2geneMedgen( + omim_id=f"OMIM:{row['#MIM number']}", + hgnc_id=xlink_by_entrez_id[row["GeneID"]].hgnc_id, + ) + ) + return list(sorted(set(result))) + + +@enum.unique +class Evidence(enum.Enum): + """Evidence from HPOA record""" + + #: Has been inferred by parsing OMIM. + INFERRED_FROM_ELECTRONIC_ANNOTATION = "IEA" + #: Extracted from articles in medical literature. + PUBLISHED_CLINICAL_STUDY = "PCS" + #: Extracted from knowledge bases such as OMIM and Orphanet. + TRACEABLE_AUTHOR_STATEMENT = "TAS" + + +@enum.unique +class Sex(enum.Enum): + """Enumeration for sex.""" + + #: Male + MALE = "MALE" + #: Female + FEMALE = "FEMALE" + + +@enum.unique +class Aspect(enum.Enum): + """Enumeration for aspect.""" + + #: Phenotype + PHENOTYPE = "P" + #: Mode of inheritance + MODE_OF_INHERITANCE = "I" + #: Clinical course + CLINICAL_COURSE = "C" + #: Modifier + CLINICAL_MODIFIER = "M" + + +class HpoaEntry(BaseModel): + """Disease-phenotype association from HPOA file.""" + + model_config = ConfigDict(use_enum_values=True) + + #: Term database ID. + database_id: str + #: Term database name. + disease_name: str + #: Whether is a positive or negative association. + qualifier: bool + #: HPO ID. + hpo_id: str + #: Evidence for this association. + reference: List[str] + #: Evidence for this association. 
def hpoa_entries_to_disease_label_map(
    hpoa_entries: List[HpoaEntry],
) -> Dict[str, DiseaseNameLabel]:
    """Create a map of disease database ID to ``DiseaseNameLabel``.

    BUGFIX: the return annotation previously claimed ``List[DiseaseNameLabel]``
    although a dict keyed by ``database_id`` has always been returned (and is
    what callers such as ``Integrator.load_data`` store as
    ``disease_label_map``).

    If several HPOA entries share a database ID, the last one wins — this
    matches the original loop's overwrite behavior.
    """
    return {
        entry.database_id: DiseaseNameLabel(
            database_id=entry.database_id, disease_name=entry.disease_name
        )
        for entry in hpoa_entries
    }
+ NTBT_E = "NTBT/E" + #: ORPHA code broader term maps to a narrower term because of an exact + #: match with an synonym in the target terminology. + BTNT_E = "BTNT/E" + #: Not yet decied / unable to decide. + ND = "ND" + + +class OrphaDisorderMapping(BaseModel): + """Mapping into a external/target disorder terminology.""" + + model_config = ConfigDict(use_enum_values=True) + + #: Identifier of source terminology with prefix of the terminology. + database_id: str + #: The relationship between ORPHA and the target terminology. + relation: DisorderMappingRelation + + +class OrphaDisorder(BaseModel): + """Description of the ORPHA disorder.""" + + #: The "ORPHA:" ID. + database_id: str + #: The name of the disorder. + database_name: str + #: Definition, if provided. + definition: Optional[str] + #: Synonym list. + synonyms: List[str] + #: Mappings in external terminologies. + mappings: List[OrphaDisorderMapping] + + +@enum.unique +class OrphaStatus(enum.Enum): + """ORPHA assessment status.""" + + #: Assessed + ASSESSED = "Assessed" + #: Not yet assessed + NOT_YET_ASSESSED = "Not yet assessed" + + +@enum.unique +class OrphaAssocationType(enum.Enum): + """ORPHA association type.""" + + ROLE = "Role in the phenotype of" + GERMLINE_LOF = "Disease-causing germline mutation(s) (loss of function) in" + GERMLINE_GOF = "Disease-causing germline mutation(s) (gain of function) in" + GERMLINE = "Disease-causing germline mutation(s) in" + SOMATIC = "Disease-causing somatic mutation(s) in" + MODIFYING_GERMLINE = "Modifying germline mutation in" + FUSION_GENE = "Part of a fusion gene in" + SUSCEPTIBILITY = "Major susceptibility factor in" + CANDIDATE_GENE = "Candidate gene tested in" + BIOMARKER = "Biomarker tested in" + + +class OrphaGeneMapping(BaseModel): + """Mapping from an ORPHA disorder to a gene.""" + + model_config = ConfigDict(use_enum_values=True) + + #: ORPHA ID. + orpha_id: str + #: Gene HGNC id. + hgnc_id: str + #: Assessment status. 
+ status: OrphaStatus + #: The type of the assessment. + association_type: OrphaAssocationType + + +def parse_orpha_jsonl( + path: str, xlink_by_gene_symbol: Dict[str, GeneXlink] +) -> Tuple[List[OrphaDisorder], List[OrphaGeneMapping]]: + """Parse ORPHA JSONL file.""" + disorders = [] + gene_mappings = [] + with open(path) as inputf: + for line in inputf: + record = json.loads(line) + # Extract disease details and cross-references to other terminologies. + disorder_mappings = [] + for mapping in record["cross_references"].get("ExternalReference") or []: + disorder_mappings.append( + OrphaDisorderMapping( + database_id=f"{mapping['Source']}:{mapping['Reference']}", + relation=DisorderMappingRelation( + mapping["DisorderMappingRelation"].split(" ")[0] + ), + ) + ) + summary_information = record["cross_references"].get("SummaryInformation") or [] + disorder = OrphaDisorder( + database_id=record["orpha_id"], + database_name=record["cross_references"]["Preferred term"], + definition=summary_information[0].get("Definition") + if summary_information + else None, + synonyms=record["cross_references"].get("Synonym") or [], + mappings=disorder_mappings, + ) + disorders.append(disorder) + # Extract gene mappings. 
+ for assoc in (record.get("disease_genes") or {}).get("DisorderGeneAssociation") or []: + hgnc_id = None + for ext_ref in assoc["Gene"]["ExternalReference"] or []: + if ext_ref["Source"] == "HGNC": + hgnc_id = f"HGNC:{ext_ref['Reference']}" + if hgnc_id is None: + if assoc["Gene"]["Symbol"] not in xlink_by_gene_symbol: + logger.info(f"Skipping GeneType = {assoc['Gene']['GeneType']}") + continue + else: + hgnc_id = xlink_by_gene_symbol[assoc["Gene"]["Symbol"]].hgnc_id + gene_mappings.append( + OrphaGeneMapping( + orpha_id=record["orpha_id"], + hgnc_id=hgnc_id, + status=OrphaStatus(assoc["DisorderGeneAssociationStatus"]), + association_type=OrphaAssocationType(assoc["DisorderGeneAssociationType"]), + ) + ) + return disorders, gene_mappings + + +@enum.unique +class PanelappConfidence(enum.Enum): + """Confidence level for PanelApp.""" + + #: Green + GREEN = "GREEN" + #: Amber + AMBER = "AMBER" + #: Red + RED = "RED" + #: None + NONE = "NONE" + + def __lt__(self, other: "PanelappConfidence") -> bool: + """Compare confidence levels. + + GREEN is smallest for sorting ergnonomics. + """ + mapping = { + PanelappConfidence.GREEN: 0, + PanelappConfidence.AMBER: 1, + PanelappConfidence.RED: 2, + PanelappConfidence.NONE: 3, + } + return mapping[self] < mapping[other] + + +@enum.unique +class PanelappEntityType(enum.Enum): + """Entity type for PanelApp.""" + + #: Gene + GENE = "GENE" + #: Region + REGION = "REGION" + #: STR + STR = "STR" + + +class PanelappPanel(BaseModel): + """Information about a PanelApp panel.""" + + #: Numeric ID + id: int + #: Name of the panel. + name: str + #: Version of the panel. + version: str + + +class PanelappAssociation(BaseModel): + """A gene-to-disease association from PanelApp.""" + + model_config = ConfigDict(use_enum_values=True) + + #: Gene HGNC ID. + hgnc_id: str + #: Confidence level. + confidence_level: PanelappConfidence + #: Entity type. + entity_type: PanelappEntityType + #: Mode of inheritance. 
def parse_panelapp_jsonl(
    path: str, xlink_by_gene_symbol: Dict[str, GeneXlink]
) -> List[PanelappAssociation]:
    """Parse PanelApp JSONL file into a list of gene-disease associations.

    Only GENE and STR entities are kept (REGION entries are skipped).
    Records without an HGNC ID whose gene symbol cannot be mapped via
    ``xlink_by_gene_symbol`` are skipped with a warning, as are records
    with confidence level NONE.
    """
    entity_type_mapping = {
        "gene": PanelappEntityType.GENE.value,
        "region": PanelappEntityType.REGION.value,
        "str": PanelappEntityType.STR.value,
    }
    confidence_level_mapping = {
        "3": PanelappConfidence.GREEN.value,
        "2": PanelappConfidence.AMBER.value,
        "1": PanelappConfidence.RED.value,
        "0": PanelappConfidence.NONE.value,
    }

    result = []
    with open(path, "rt") as inputf:
        for line in inputf:
            record = json.loads(line)
            entity_type = PanelappEntityType(entity_type_mapping[record["entity_type"]])
            if entity_type not in [PanelappEntityType.GENE, PanelappEntityType.STR]:
                continue
            hgnc_id = record["gene_data"].get("hgnc_id")
            if not hgnc_id:
                # Fall back to resolving the gene symbol through the xlink table.
                hgnc_id = getattr(
                    xlink_by_gene_symbol.get(record["gene_data"]["gene_symbol"]), "hgnc_id", None
                )
            if not hgnc_id:
                # BUGFIX: loguru's logger has no ``warn()`` method (only
                # ``warning()``); the previous ``logger.warn(...)`` call raised
                # ``AttributeError`` whenever this branch was reached.
                logger.warning(
                    f"Skipping record without HGNC ID and unmappable gene symbol: {record['gene_data']['gene_symbol']}"
                )
                continue
            record["confidence_level"] = confidence_level_mapping[record["confidence_level"]]
            assoc = PanelappAssociation(
                hgnc_id=hgnc_id,
                confidence_level=PanelappConfidence(record["confidence_level"]),
                mode_of_inheritance=record["mode_of_inheritance"],
                entity_type=entity_type,
                phenotypes=record["phenotypes"],
                panel=PanelappPanel(
                    id=record["panel"]["id"],
                    name=record["panel"]["name"],
                    version=record["panel"]["version"],
                ),
            )
            # BUGFIX: ``PanelappAssociation`` uses ``use_enum_values=True``, so
            # ``assoc.confidence_level`` is the *string* value; comparing it to
            # the enum member was always unequal and NONE-confidence records
            # were never filtered out.  Compare against the enum's value.
            if assoc.confidence_level != PanelappConfidence.NONE.value:
                result.append(assoc)
    return result
def parse_mondo_unmapped_omim(path: str) -> List[MondoUnmappedOmim]:
    """Parse MONDO unmapped OMIM file.

    The file is a TSV whose first line is a header starting with
    ``subject_id``; each data line yields one ``MondoUnmappedOmim`` record.
    """
    result = []
    with open(path) as inputf:
        for line in inputf:
            # BUGFIX: the header line was detected but then fell through and
            # was parsed as data (the old code used ``pass`` instead of
            # ``continue``), producing a bogus record with
            # subject_id == "subject_id".
            if line.startswith("subject_id"):
                continue
            record = line.rstrip("\n").split("\t")
            result.append(MondoUnmappedOmim(subject_id=record[0], subject_label=record[1]))
    return result
@enum.unique
class GeneDiseaseAssociationSource(enum.Enum):
    """Enumeration for gene-disease association source."""

    #: OMIM.
    OMIM = "OMIM"
    #: Orphanet.
    ORPHANET = "ORPHANET"
    #: PanelApp.
    PANELAPP = "PANELAPP"

    def __lt__(self, other: "GeneDiseaseAssociationSource") -> bool:
        """Order sources lexicographically by their string value.

        BUGFIX: the ``other`` annotation previously referenced
        ``ConfidenceLevel``, copy-pasted from that enum; comparison has
        always been between two sources.
        """
        return self.value < other.value
= Field(..., exclude=True) + #: Disease name for display, if any. + disease_name: Optional[str] + #: Disease definition from Orphanet, if available. + disease_definition: Optional[str] + #: The overall supporting sources. + sources: Tuple[GeneDiseaseAssociationSource, ...] + #: The overall confidence level. + confidence: ConfidenceLevel + + def merge(self, other: "GeneDiseaseAssociation") -> "GeneDiseaseAssociation": + if self.hgnc_id != other.hgnc_id: + raise RuntimeError("Cannot merge different genes.") + labeled_disease_ids = tuple( + sorted(set(chain(self.labeled_disorders, other.labeled_disorders))) + ) + disease_ids = tuple(sorted(set(chain(self.disease_ids, other.disease_ids)))) + disease_name = self.disease_name or other.disease_name + disease_definition = self.disease_definition or other.disease_definition + sources = tuple(sorted(set(chain(self.sources, other.sources)))) + if self.confidence < other.confidence: + confidence = other.confidence + else: + confidence = self.confidence + return GeneDiseaseAssociation( + hgnc_id=self.hgnc_id, + labeled_disorders=labeled_disease_ids, + disease_ids=disease_ids, + disease_name=disease_name, + disease_definition=disease_definition, + sources=sources, + confidence=confidence, + ) + + +class CtdDiseaseEntry(BaseModel): + """Entry in CTD disease database.""" + + model_config = ConfigDict(frozen=True) + + disease_name: str + disease_id: str + alt_disease_ids: List[str] + definition: str + parent_ids: List[str] + tree_numbers: List[str] + parent_tree_numbers: List[str] + synonyms: List[str] + slim_mappings: List[str] + + +def parse_ctd_disease_tsv(path: str) -> List[CtdDiseaseEntry]: + """Parse CTD disease database TSV file.""" + result = [] + with gzip.open(path, "rt") as inputf: + for line in inputf: + if line.startswith("#"): + continue + record = line.rstrip("\n").split("\t") + result.append( + CtdDiseaseEntry( + disease_name=record[0], + disease_id=record[1], + alt_disease_ids=record[2].split("|"), + 
def parse_do_omim_unmapped(path: str) -> List[DoLabel]:
    """Parse DO OMIM "unmapped" CSV file into ``DoLabel`` records.

    The first line is a header and is skipped; each remaining row yields
    one ``DoLabel`` with the OMIM ID and its label.
    """
    result = []
    with open(path, newline="") as inputf:
        reader = csv.reader(inputf)
        # Skip the header row; an empty file simply yields no records.
        next(reader, None)
        for record in reader:
            # BUGFIX: the old code used a naive ``line.split(",")`` which
            # truncated quoted labels containing commas (frequent in OMIM
            # titles, e.g. "..., variation in") and left quote characters in
            # place; ``csv.reader`` handles quoting correctly.
            result.append(DoLabel(omim_id=record[0], label=record[1]))
    return result
+ + def __lt__(self, other: "ResultContainer") -> bool: + """Compare two result containers.""" + return self.results < other.results + + +#: Some manual OMIM labels that are on MONDO exclusion lists etc. +MANUAL_OMIM_LABELS = { + "OMIM:300932": "Thyroxine-binding globulin QTL", + "OMIM:615018": "Sd(a) polyagglutination syndrome", + "OMIM:617956": "Beta-glycopyranoside tasting", + "OMIM:617966": "Low density lipoprotein cholesterol level QTL 7", + "OMIM:617995": "IMPDH2 enzyme activity, variation in", + "OMIM:618079": "Low density lipoprotein cholesterol level QTL 8", + "OMIM:618807": "LPA deficiency, congenital", + "OMIM:619812": "Blood group, EMM system", + "OMIM:620116": "Fatty liver disease, protection from", + "OMIM:620207": "ER blood group system", +} + + +#: MondoDiseaseRelations that are considered to be "good enough" mappings. +GOOD_MONDO_RELATIONS = ( + MondoDiseaseRelation.EXACT.value, + MondoDiseaseRelation.NARROW.value, + MondoDiseaseRelation.BROAD.value, + MondoDiseaseRelation.RELATED.value, +) +#: DisorderMappingRelation that are considered to be "good enough" mappings. +GOOD_ORPHA_RELATIONS = ( + DisorderMappingRelation.EXACT.value, + DisorderMappingRelation.BTNT.value, # ? + DisorderMappingRelation.BTNT_E.value, + DisorderMappingRelation.NTBT.value, + DisorderMappingRelation.NTBT_E.value, # ? +) + + +class GeneDiseaseKey(BaseModel): + """Key for a gene-disease association.""" + + model_config = ConfigDict(frozen=True) + + #: Gene HGNC ID. + hgnc_id: str + #: Disease database ID. 
+ disease_id: str + + +class Integrator: + """Implementation of the integration algorithm.""" + + # data loaded / pickled + + gene_xlinks: List[GeneXlink] + xlink_by_hgnc_id: Dict[str, GeneXlink] + xlink_by_gene_symbol: Dict[str, GeneXlink] + xlink_by_entrez_id: Dict[str, GeneXlink] + xlink_by_omim_id: Dict[str, GeneXlink] + mim2gene_medgen: List[Mim2geneMedgen] + hpoa_entries: List[HpoaEntry] + disease_label_map: Dict[str, DiseaseNameLabel] + orpha_disorders: List[OrphaDisorder] + orpha_mappings: List[OrphaGeneMapping] + panelapp_associations: List[PanelappAssociation] + mondo_diseases: List[MondoDisease] + mondo_unmapped_omim: List[MondoUnmappedOmim] + ctd_diseases: List[CtdDiseaseEntry] + do_unmapped_labels: List[DoLabel] + omim_in_dos: List[OmimInDo] + + # other data + + #: OMIM titles + omim_titles: Dict[str, str] + #: Mappings from MONDO to ORPHA + mondo_to_orpha: Dict[str, Set[str]] + #: Mappings from MONDO to OMIM + mondo_to_omim: Dict[str, Set[str]] + #: Mappings from ORPHA to OMIM + orpha_to_omim: Dict[str, Set[str]] + #: Mappings from OMIM to ORPHA + omim_to_orpha: Dict[str, Set[str]] + + def __init__(self): + """Initialise the integrator.""" + #: Mapping from `(gene_hgnc_id, disease_id)` to `GeneDiseaseAssociationEntry`. + self.disease_assocs: Dict[GeneDiseaseKey, GeneDiseaseAssociation] = {} + #: Mapping from `hgnc_id` to list of `PanelappAssociation`s. 
+ self.panelapp_assocs: Dict[str, List[PanelappAssociation]] = {} + + def register_disease_assoc(self, assoc: GeneDiseaseAssociation): + """Register a gene-disease association.""" + found_list = set() + for disease_id in assoc.disease_ids: + key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id) + if key in self.disease_assocs: + found_list.add(self.disease_assocs[key]) + if not found_list: + for disease_id in assoc.disease_ids: + key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id) + self.disease_assocs[key] = assoc + else: + if len(found_list) != 1: + logger.warning(f"Found multiple associations for {assoc.hgnc_id}") + for found in found_list: + found = found.merge(assoc) + for disease_id in assoc.disease_ids: + key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id) + self.disease_assocs[key] = found + + def run(self, pickle_path: Optional[str] = None): + logger.info("Building gene-disease map...") + self.load_data(pickle_path) + self.handle_orpha() + self.handle_mim2gene_medgen() + self.handle_panelapp() + + conditions_by_hgnc = { + hgnc_id: GeneConditionAssociation( + hgnc_id=hgnc_id, disease_associations=[], panelapp_associations=[] + ) + for hgnc_id in sorted( + set( + chain( + (k.hgnc_id for k in self.disease_assocs.keys()), self.panelapp_assocs.keys() + ) + ) + ) + } + for hgnc_id, assocs in self.panelapp_assocs.items(): + conditions_by_hgnc[hgnc_id] = conditions_by_hgnc[hgnc_id].model_copy( + update={ + "panelapp_associations": tuple(sorted(assocs, key=lambda a: a.confidence_level)) + } + ) + for key, assoc in self.disease_assocs.items(): + disease_assoc = conditions_by_hgnc[key.hgnc_id].disease_associations + conditions_by_hgnc[key.hgnc_id] = conditions_by_hgnc[key.hgnc_id].model_copy( + update={ + "disease_associations": tuple( + sorted(chain(disease_assoc, [assoc]), key=lambda a: a.confidence) + ) + } + ) + result = ResultContainer(results=tuple(conditions_by_hgnc.values())) + for assoc in result.results: + 
json.dump(obj=assoc.model_dump(mode="json"), fp=sys.stdout) + sys.stdout.write("\n") + logger.info("All done. Have a nice day.") + + def load_data(self, pickle_path: Optional[str] = None): # noqa: C901 + if pickle_path and os.path.exists(pickle_path): + logger.info("unpickling...") + with gzip.open(pickle_path, "rb") as inputf: + ( + self.gene_xlinks, + self.mim2gene_medgen, + self.hpoa_entries, + self.disease_label_map, + self.orpha_disorders, + self.orpha_mappings, + self.panelapp_associations, + self.mondo_diseases, + self.mondo_unmapped_omim, + self.ctd_diseases, + self.do_unmapped_labels, + self.omim_in_dos, + self.do_omim_import, + ) = pickle.load(inputf) + self.xlink_by_entrez_id = { + xlink.entrez_id: xlink for xlink in self.gene_xlinks if xlink.entrez_id + } + self.xlink_by_gene_symbol = {xlink.gene_symbol: xlink for xlink in self.gene_xlinks} + else: + logger.info("Parsing genes xlink file...") + self.gene_xlinks = parse_gene_xlink(GENES_XLINK_PATH) + self.xlink_by_entrez_id = { + xlink.entrez_id: xlink for xlink in self.gene_xlinks if xlink.entrez_id + } + logger.info("Parsing mim2gene_medgen file...") + self.mim2gene_medgen = parse_mim2gene_medgen( + MIM2GENE_MEDGEN_PATH, self.xlink_by_entrez_id + ) + logger.info("Parsing HPOA file...") + self.hpoa_entries = parse_hpoa(HPOA_PATH) + logger.info("Converting to disease label map...") + self.disease_label_map = hpoa_entries_to_disease_label_map(self.hpoa_entries) + # print(list(disease_label_map.items())[:10]) + logger.info("Parsing ORPHA JSONL file...") + self.xlink_by_gene_symbol = {xlink.gene_symbol: xlink for xlink in self.gene_xlinks} + self.orpha_disorders, self.orpha_mappings = parse_orpha_jsonl( + ORPHA_JSONL_PATH, self.xlink_by_gene_symbol + ) + # print(json.dumps([d.model_dump() for d in disorders[:10]], indent=2)) + logger.info("Parsing PanelApp JSONL file...") + self.panelapp_associations = parse_panelapp_jsonl( + PANELAPP_JSONL_PATH, self.xlink_by_gene_symbol + ) + + logger.info("Loading MONDO 
files...") + self.mondo_diseases = parse_mondo_obo(MONDO_OBO_PATH) + self.mondo_unmapped_omim = parse_mondo_unmapped_omim(MONDO_UNMAPPED_OMIM_PATH) + logger.info("Loading CTD disease file...") + self.ctd_diseases = parse_ctd_disease_tsv(CTD_PATH) + logger.info("Loading DO files...") + self.do_unmapped_labels = parse_do_omim_unmapped(DO_OMIM_UNMAPPED_PATH) + self.omim_in_dos = parse_do_omim_in_do(DO_OMIM_INDO_PATH) + self.do_omim_import = parse_do_omim_import(DO_OMIM_IMPORT_PATH) + + if pickle_path: + with gzip.open(pickle_path, "wb") as outputf: + logger.info("Pickling...") + pickle.dump( + ( + self.gene_xlinks, + self.mim2gene_medgen, + self.hpoa_entries, + self.disease_label_map, + self.orpha_disorders, + self.orpha_mappings, + self.panelapp_associations, + self.mondo_diseases, + self.mondo_unmapped_omim, + self.ctd_diseases, + self.do_unmapped_labels, + self.omim_in_dos, + self.do_omim_import, + ), + outputf, + ) + + #: Build gene mappings. + self.xlink_by_hgnc_id = {xlink.hgnc_id: xlink for xlink in self.gene_xlinks} + self.xlink_by_omim_id = { + f"OMIM:{omim_id}": xlink for xlink in self.gene_xlinks for omim_id in xlink.omim_ids + } + + # Build mapping from OMIM to title. 
+ self.omim_titles = {} + for entry in self.hpoa_entries: + if entry.database_id.startswith("OMIM:"): + self.omim_titles[entry.database_id] = entry.disease_name + for label in self.do_unmapped_labels: + self.omim_titles.setdefault(label.omim_id, label.label) + for omim_in_do in self.omim_in_dos: + for xref in omim_in_do.xrefs: + if xref.startswith("OMIM:") and xref not in self.omim_titles: + self.omim_titles[xref] = omim_in_do.label + for label in self.do_omim_import: + if label.omim_id.startswith("OMIM:") and label.omim_id not in self.omim_titles: + self.omim_titles[label.omim_id] = label.label + for ctd_disease in self.ctd_diseases: + for disease_id in chain([ctd_disease.disease_id], ctd_disease.alt_disease_ids): + if disease_id.startswith("OMIM:") and disease_id not in self.omim_titles: + self.omim_titles[disease_id] = ctd_disease.disease_name + for unmapped_record in self.mondo_unmapped_omim: + self.omim_titles.setdefault(unmapped_record.subject_id, unmapped_record.subject_label) + for key, value in MANUAL_OMIM_LABELS.items(): + self.omim_titles.setdefault(key, value) + + # Build ID mappings from MONDO to OMIM and ORPHA. + self.mondo_to_omim = {} + self.mondo_to_orpha = {} + for relation in GOOD_MONDO_RELATIONS: + for mondo_disease in self.mondo_diseases: + for synonym in mondo_disease.synonyms: + if ( + synonym.database_id.startswith("OMIM:") + and synonym.database_id != "OMIM:genemap2" + and relation in synonym.relation + and synonym.database_id not in self.mondo_to_omim + ): + self.mondo_to_omim.setdefault(mondo_disease.mondo_id, set()).add( + synonym.database_id + ) + if ( + synonym.database_id.startswith("Orphanet:") + and relation in synonym.relation + and synonym.database_id not in self.mondo_to_orpha + ): + self.mondo_to_orpha.setdefault(mondo_disease.mondo_id, set()).add( + synonym.database_id.replace("Orphanet:", "ORPHA:").replace( + "-definition", "" + ) + ) + # Build ID mappings between ORPHA and OMIM. 
+ self.orpha_to_omim = {} + self.omim_to_orpha = {} + for orpha_disorder in self.orpha_disorders: + orpha_id = orpha_disorder.database_id + for mapping in orpha_disorder.mappings: + if ( + mapping.database_id.startswith("OMIM:") + and mapping.relation in GOOD_ORPHA_RELATIONS + ): + omim_id = mapping.database_id + self.orpha_to_omim.setdefault(orpha_id, set()).add(omim_id) + self.omim_to_orpha.setdefault(omim_id, set()).add(orpha_id) + + def handle_mim2gene_medgen(self): + """Process the mim2gene_medgen data.""" + logger.info("Processing mim2gene_medgen...") + # Our primary source for OMIM names is the HPOA table. + hpoa_by_disease_id = { + entry.database_id: entry for entry in self.hpoa_entries if entry.database_id + } + # Our second source for OMIM names are files from DO. + mondo_by_omim_id: Dict[str, List[MondoDisease]] = {} + for relation in GOOD_MONDO_RELATIONS: + for mondo_disease in self.mondo_diseases: + for synonym in mondo_disease.synonyms: + if ( + synonym.database_id.startswith("OMIM:") + and relation in synonym.relation + and synonym.database_id not in mondo_by_omim_id + ): + mondo_by_omim_id.setdefault(synonym.database_id, []).append(mondo_disease) + # do_by_disease_id: Dict[str, str] = { + # label.omim_id: label.label for label in self.do_unmapped_labels + # } + # for omim_in_do in self.omim_in_dos: + # for xref in omim_in_do.xrefs: + # if not xref in do_by_disease_id: + # do_by_disease_id[xref] = omim_in_do.label + # for label in self.do_omim_import: + # if label.omim_id not in do_by_disease_id: + # do_by_disease_id[label.omim_id] = label.label + # Our third source for OMIM names is the CTD disease database. + ctd_by_disease_id: Dict[str, CtdDiseaseEntry] = {} + for ctd_disease in self.ctd_diseases: + for disease_id in ctd_disease.alt_disease_ids: + if disease_id not in ctd_by_disease_id: + ctd_by_disease_id[disease_id] = ctd_disease + # If annotation with these names fails, Orphanet terms will be tried later. 
+ for record in self.mim2gene_medgen: + disease_name = None + disease_ids = [record.omim_id] + if record.omim_id in hpoa_by_disease_id: + disease_name = hpoa_by_disease_id[record.omim_id].disease_name + elif record.omim_id in mondo_by_omim_id: + mondo_records = mondo_by_omim_id[record.omim_id] + disease_name = mondo_records[0].name + disease_ids += [record.mondo_id for record in mondo_records] + elif record.omim_id in ctd_by_disease_id: + disease_name = ctd_by_disease_id[record.omim_id].disease_name + elif record.omim_id in self.omim_titles: + disease_name = self.omim_titles[record.omim_id] + self.register_disease_assoc( + GeneDiseaseAssociation( + hgnc_id=record.hgnc_id, + labeled_disorders=[LabeledDisorder(term_id=record.omim_id, title=disease_name)], + disease_ids=list(sorted(set(disease_ids))), + disease_name=disease_name, + disease_definition=None, + sources=[GeneDiseaseAssociationSource.OMIM], + confidence=ConfidenceLevel.HIGH, + ) + ) + + def handle_orpha(self): + """Process the ORPHA data.""" + logger.info("Processing ORPHA...") + orpha_disorders_by_orpha_id = {d.database_id: d for d in self.orpha_disorders} + + # MONDO mapping for proper aliasing. 
+        mondo_by_orpha_id: Dict[str, List[MondoDisease]] = {}
+        for relation in GOOD_MONDO_RELATIONS:
+            for mondo_disease in self.mondo_diseases:
+                for synonym in mondo_disease.synonyms:
+                    if (
+                        synonym.database_id.startswith("Orphanet:")
+                        and relation in synonym.relation
+                        and synonym.database_id not in mondo_by_orpha_id  # FIXME: never True ("Orphanet:*" vs "ORPHA:*" keys)
+                    ):
+                        orpha_id = synonym.database_id.replace("Orphanet:", "ORPHA:")
+                        mondo_by_orpha_id.setdefault(orpha_id, []).append(mondo_disease)
+
+        for orpha_mapping in self.orpha_mappings:
+            database_ids = [orpha_mapping.orpha_id]
+            orpha_disorder = orpha_disorders_by_orpha_id[orpha_mapping.orpha_id]
+            for mapping in orpha_disorder.mappings:
+                if (
+                    mapping.database_id.startswith("OMIM:")
+                    and mapping.relation in GOOD_ORPHA_RELATIONS
+                ):
+                    database_ids.append(mapping.database_id)
+            if orpha_mapping.orpha_id in mondo_by_orpha_id:
+                database_ids += [
+                    record.mondo_id for record in mondo_by_orpha_id[orpha_mapping.orpha_id]
+                ]
+            self.register_disease_assoc(
+                GeneDiseaseAssociation(
+                    hgnc_id=orpha_mapping.hgnc_id,
+                    labeled_disorders=[
+                        LabeledDisorder(
+                            term_id=orpha_mapping.orpha_id, title=orpha_disorder.database_name
+                        )
+                    ],
+                    disease_ids=list(sorted(set(database_ids))),
+                    disease_name=orpha_disorder.database_name,
+                    disease_definition=orpha_disorder.definition,
+                    sources=[GeneDiseaseAssociationSource.ORPHANET],
+                    confidence={
+                        OrphaStatus.ASSESSED.value: ConfidenceLevel.HIGH,
+                        OrphaStatus.NOT_YET_ASSESSED.value: ConfidenceLevel.LOW,
+                    }[orpha_mapping.status],
+                )
+            )
+
+    def handle_panelapp(self):  # noqa: C901
+        """Process the PanelApp data."""
+        RE_OMIM = r"(?:O?MIM:)?(\d\d\d\d\d\d)"  # NOTE: no word boundaries; also matches digit runs inside longer IDs
+        RE_MONDO = r"MONDO:(\d+)"
+        RE_ORPHA = r"ORPHA:(\d+)"
+
+        logger.info("Processing PanelApp...")
+        orpha_disorders: Dict[str, OrphaDisorder] = {
+            disease.database_id: disease for disease in self.orpha_disorders
+        }
+        for assoc in self.panelapp_associations:
+            self.panelapp_assocs.setdefault(assoc.hgnc_id, []).append(assoc)
+
+            if assoc.confidence_level == 
PanelappConfidence.NONE.value:
+                continue
+            if assoc.phenotypes:
+                for phenotype in assoc.phenotypes:
+                    m_omim = re.search(RE_OMIM, phenotype, re.IGNORECASE)
+                    m_mondo = re.search(RE_MONDO, phenotype, re.IGNORECASE)
+                    m_orpha = re.search(RE_ORPHA, phenotype, re.IGNORECASE)
+                    labeled_omim_ids: List[LabeledDisorder] = []
+                    labeled_orpha_ids: List[LabeledDisorder] = []
+                    omim_ids: List[str] = []
+                    orpha_ids: List[str] = []
+                    orpha_disorder: Optional[OrphaDisorder] = None
+                    if m_mondo:
+                        mondo_id = f"MONDO:{m_mondo.group(1)}"
+                        omim_ids = list(self.mondo_to_omim.get(mondo_id, []))
+                        orpha_ids = list(sorted(self.mondo_to_orpha.get(mondo_id, set())))
+                        # We base the disease name and description on the first Orpha ID.
+                        # If the association already exists, this one will be ignored.
+                        if orpha_ids:
+                            orpha_disorder = orpha_disorders.get(orpha_ids[0])
+                    elif m_orpha:
+                        orpha_id = f"ORPHA:{m_orpha.group(1)}"
+                        orpha_ids = [orpha_id]
+                        omim_ids = list(self.orpha_to_omim.get(orpha_id, []))
+                        if orpha_ids:
+                            orpha_disorder = orpha_disorders.get(orpha_id)
+                    elif m_omim:
+                        omim_id = f"OMIM:{m_omim.group(1)}"
+                        if omim_id not in self.xlink_by_omim_id:  # OMIM was a gene...
+                            labeled_omim_ids = omim_ids = [omim_id]
+                            orpha_ids = list(sorted(self.omim_to_orpha.get(omim_id, set())))
+                            # We base the disease name and description on the first Orpha ID.
+                            # If the association already exists, this one will be ignored.
+                            if orpha_ids:
+                                orpha_disorder = orpha_disorders.get(orpha_ids[0])
+                        else:
+                            logger.debug(f"Unresolved phenotype {phenotype} (OMIM is gene)")
+                    else:
+                        logger.debug(f"Unresolved phenotype {phenotype}")
+
+                    omim_txt = f"OMIM:{m_omim.group(1)}" if m_omim else "\x00"  # omim_id only bound in m_omim branch
+                    stripped_phenotype = (
+                        phenotype.replace(f", {omim_txt}", "")  # trim ", "
+                        .replace(f" {omim_txt}", "")  # trim " "
+                        .replace(omim_txt, "")  # trim ""
+                        .replace("{", "").replace("}", "")  # trim curly braces in "{xxx}"
+                    )
+
+                    labeled_omim_ids = [
+                        LabeledDisorder(
+                            term_id=omim_id, title=self.omim_titles.get(omim_id, stripped_phenotype)
+                        )
+                        for omim_id in omim_ids
+                    ]
+                    labeled_orpha_ids = [
+                        LabeledDisorder(
+                            term_id=orpha_id, title=orpha_disorders[orpha_id].database_name
+                        )
+                        for orpha_id in orpha_ids
+                    ]
+
+                    if omim_ids or orpha_ids:
+                        # Obtain the disease name from orpha_id/omim_id and lookup.
+                        disease_name = None
+                        for orpha_id in orpha_ids:
+                            if orpha_id in orpha_disorders:
+                                disease_name = orpha_disorders[orpha_id].database_name
+                                break
+                        if not disease_name:
+                            for omim_id in omim_ids:
+                                if omim_id in self.omim_titles:
+                                    disease_name = self.omim_titles[omim_id]
+                                    break
+                        # Use label from PanelApp as fallback.
+                        if not disease_name and m_omim:
+                            disease_name = stripped_phenotype
+                        # We found at least one OMIM/ORPHA ID. We will now create
+                        # a new `GeneDiseaseAssociation` object from this which will
+                        # be merged into any existing.
+                        gene_disease_assoc = GeneDiseaseAssociation(
+                            hgnc_id=assoc.hgnc_id,
+                            labeled_disorders=list(
+                                sorted(set(labeled_omim_ids + labeled_orpha_ids))
+                            ),
+                            disease_ids=list(sorted(set(omim_ids + orpha_ids))),
+                            disease_name=disease_name,
+                            disease_definition=orpha_disorder.definition
+                            if orpha_disorder
+                            else None,
+                            sources=[GeneDiseaseAssociationSource.PANELAPP],
+                            confidence={
+                                PanelappConfidence.GREEN.value: ConfidenceLevel.HIGH,
+                                PanelappConfidence.AMBER.value: ConfidenceLevel.MEDIUM,
+                                PanelappConfidence.RED.value: ConfidenceLevel.LOW,
+                            }[assoc.confidence_level],
+                        )
+                        self.register_disease_assoc(gene_disease_assoc)
+            else:
+                logger.debug(f"UNRESOLVED: {assoc.panel.name}")
+
+
+if __name__ == "__main__":
+    Integrator().run("store.pickle.gz" if DEV_MODE else None)
diff --git a/scripts/genes-orpha-diseases.py b/scripts/genes-orpha-diseases.py
index 75da52d..ad9a426 100755
--- a/scripts/genes-orpha-diseases.py
+++ b/scripts/genes-orpha-diseases.py
@@ -1,38 +1,72 @@
 #!/usr/bin/env python
-"""Helper script to extract gene-disease association from orphapacket."""
+"""Helper script to retrieve ORDO gene-disease associations from OrphaData.
+
+When run in CI mode (environment variable "CI" is "true") then we will copy
+the manually created file from excerpt-data/__orphadata__. Note that this is
+a deviation from the usual approach of putting the URL into `download_urls`.
+"""
 
-import csv
 import json
-import pathlib
 import sys
 
+import httpx
+import trio
+from loguru import logger
+
+#: URL for listing ORPHAcode.
+URL_ORPHACODE_LIST = "https://api.orphadata.com/rd-cross-referencing/orphacodes"
+#: URL template for getting information on one ORPHAcode.
+URL_ORPHACODE_GET = "https://api.orphadata.com/rd-cross-referencing/orphacodes/{}?lang=en"
+#: URL template for getting genes on one ORPHAcode.
+URL_ORPHACODE_GET_GENE = "https://api.orphadata.com/rd-associated-genes/orphacodes/{}"
+
+#: Whether we run in test mode.
+RUNS_IN_CI = os.environ.get("CI", "false") == "true" + + +async def main(): + async with httpx.AsyncClient() as client: + logger.info("Fetching ORPHAcode list...") + lst = await client.get(URL_ORPHACODE_LIST) + logger.info("...done") + disease_ids = {disease["ORPHAcode"] for disease in lst.json()["data"]["results"]} + + async def work(no: int, orpha_id: int, limiter: trio.CapacityLimiter): + async with limiter: + async with httpx.AsyncClient() as client: + try: + cross_references = (await client.get(URL_ORPHACODE_GET.format(orpha_id))).json() + disease_genes = ( + await client.get(URL_ORPHACODE_GET_GENE.format(orpha_id), timeout=60) + ).json() + except Exception as e: + logger.error(f"Error fetching {orpha_id}: {e}") + raise + finally: + if no % 100 == 0: + logger.info(f"done fetching ORPHAcode details {no}/{len(disease_ids)}") + json.dump( + { + "orpha_id": f"ORPHA:{orpha_id}", + "cross_references": cross_references["data"]["results"], + "disease_genes": disease_genes.get("data", {}).get("results"), + }, + sys.stdout, + ) + sys.stdout.write("\n") -def main(): - symbol_to_hgnc = {} - with open(sys.argv[1], "rt") as inputf: - reader = csv.DictReader(inputf, delimiter="\t") - for record in reader: - symbol_to_hgnc[record["gene_symbol"]] = record["hgnc_id"] - - print(f"# xlink entries: {len(symbol_to_hgnc)}", file=sys.stderr) - - base_path = pathlib.Path(sys.argv[2]) - print("\t".join(["hgnc_id", "orpha_id", "disease_name"])) - for json_path in sorted(base_path.glob("*.json")): - with json_path.open("rt") as inputf: - data = json.load(inputf) - elem_top = data["Orphapacket"] - if elem_top.get("DisorderType", {}).get("value") != "Disease": - continue # skip categories - disease_name = elem_top["Label"] - orpha_id = elem_top["PURL"].replace("http://www.orpha.net/ORDO/Orphanet_", "ORPHA:") - elem_genes = elem_top.get("Genes", []) - for elem_gene in elem_genes: - gene_symbol = elem_gene["Gene"]["Symbol"] - hgnc_id = symbol_to_hgnc.get(gene_symbol) - if hgnc_id: # skip if 
no HGNC ID exists, maybe withdrawn? - print("\t".join(map(str, [hgnc_id, orpha_id, disease_name]))) + logger.info("Fetching ORPHAcode details...") + limiter = trio.CapacityLimiter(10) + async with trio.open_nursery() as nursery: + for no, disease_id in enumerate(disease_ids): + nursery.start_soon(work, no, disease_id, limiter) + logger.info("...done") if __name__ == "__main__": - main() + if RUNS_IN_CI: + with open("excerpt-data/__orphadata__/orphadata.jsonl", "rt") as inputf: + sys.stdout.write(inputf.read()) + else: + trio.run(main) diff --git a/scripts/genes-xlink-hgnc.jq b/scripts/genes-xlink-hgnc.jq index 271e579..e6be805 100644 --- a/scripts/genes-xlink-hgnc.jq +++ b/scripts/genes-xlink-hgnc.jq @@ -1,7 +1,7 @@ # Create gene identifier interlink table. # Create header -["hgnc_id", "ensembl_gene_id", "entrez_id", "gene_symbol"], +["hgnc_id", "ensembl_gene_id", "entrez_id", "gene_symbol", "omim_ids"], # Then, process the file ( # For each entry in .response.docs @@ -12,7 +12,8 @@ .hgnc_id // "", .ensembl_gene_id // "", .entrez_id // "", - .symbol // "" + .symbol // "", + ((.omim_id // []) | join(",")) ] ) # Convert everything to TSV diff --git a/varfish_db_downloader/versions.py b/varfish_db_downloader/versions.py index 4ec6af1..1176104 100644 --- a/varfish_db_downloader/versions.py +++ b/varfish_db_downloader/versions.py @@ -102,8 +102,8 @@ class DataVersions: acmg_sf: str #: HPO hpo: str - #: OrphaPacket - orphapacket: str + #: Orphadata + orphadata: str #: Pathogenic MMS patho_mms: str #: Mehari transcript data. @@ -161,8 +161,8 @@ class DataVersions: refseq_38="GCF_000001405.40+RS_2023_03", dbsnp="b151", acmg_sf="3.1", - hpo="20230606", - orphapacket="10.1", + hpo="20240116", + orphadata=TODAY, patho_mms="20220730", mehari_tx="0.4.4", clinvar_release=CLINVAR_RELEASE,