From 2f61cc704b35ce9c1cca26793a4638ac8ab725f3 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 2 Jan 2024 16:09:17 +0100 Subject: [PATCH 1/2] feat: integrating PanelApp download for annonars (#79) --- Snakefile | 1 + download_urls.yml | 13 ++++++++ excerpt-data/c9c7d6df0e24b954/__index__ | 3 ++ excerpt-data/c9c7d6df0e24b954/url.txt | 3 ++ excerpt-data/cdaaf7a3f7595d3d/__index__ | 3 ++ excerpt-data/cdaaf7a3f7595d3d/url.txt | 3 ++ excerpt-data/ebc07f725c64907d/__index__ | 3 ++ excerpt-data/ebc07f725c64907d/url.txt | 3 ++ rules/work/genes/panelapp.smk | 43 +++++++++++++++++++++++++ varfish_db_downloader/wget.py | 5 ++- 10 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 excerpt-data/c9c7d6df0e24b954/__index__ create mode 100644 excerpt-data/c9c7d6df0e24b954/url.txt create mode 100644 excerpt-data/cdaaf7a3f7595d3d/__index__ create mode 100644 excerpt-data/cdaaf7a3f7595d3d/url.txt create mode 100644 excerpt-data/ebc07f725c64907d/__index__ create mode 100644 excerpt-data/ebc07f725c64907d/url.txt create mode 100644 rules/work/genes/panelapp.smk diff --git a/Snakefile b/Snakefile index c3e712f..43eacca 100644 --- a/Snakefile +++ b/Snakefile @@ -349,6 +349,7 @@ include: "rules/work/genes/hgnc.smk" include: "rules/work/genes/mehari_data_tx.smk" include: "rules/work/genes/ncbi.smk" include: "rules/work/genes/omim.smk" +include: "rules/work/genes/panelapp.smk" include: "rules/work/genes/orphapacket.smk" include: "rules/work/genes/rcnv.smk" include: "rules/work/genes/shet.smk" diff --git a/download_urls.yml b/download_urls.yml index 6bfcbb6..e7a9609 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -1,3 +1,16 @@ +- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/ + excerpt_strategy: + strategy: no-excerpt + count: null +- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/?page=2 + excerpt_strategy: + strategy: no-excerpt + count: null +- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/?page=3 + 
excerpt_strategy: + strategy: no-excerpt + count: null + # dbNSFP v4.5a - url: https://dbnsfp.s3.amazonaws.com/dbNSFP4.5a.zip excerpt_strategy: diff --git a/excerpt-data/c9c7d6df0e24b954/__index__ b/excerpt-data/c9c7d6df0e24b954/__index__ new file mode 100644 index 0000000..b7d18d7 --- /dev/null +++ b/excerpt-data/c9c7d6df0e24b954/__index__ @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9399a8c284d4f3141b19a8e943618ec53c52e2523af9b8b8222d4ead0a8a633 +size 151718 diff --git a/excerpt-data/c9c7d6df0e24b954/url.txt b/excerpt-data/c9c7d6df0e24b954/url.txt new file mode 100644 index 0000000..1d06402 --- /dev/null +++ b/excerpt-data/c9c7d6df0e24b954/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a5d3462d70d2fdab6ac9dfa26cbac6437db26ddfc2fcb5cf0d9796cdce77d2d +size 63 diff --git a/excerpt-data/cdaaf7a3f7595d3d/__index__ b/excerpt-data/cdaaf7a3f7595d3d/__index__ new file mode 100644 index 0000000..2c804ae --- /dev/null +++ b/excerpt-data/cdaaf7a3f7595d3d/__index__ @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81847a00717aad78c26fcb09ab10c918ef8e8b0eba906f8b4beef24a91f4435c +size 195929 diff --git a/excerpt-data/cdaaf7a3f7595d3d/url.txt b/excerpt-data/cdaaf7a3f7595d3d/url.txt new file mode 100644 index 0000000..9e3f355 --- /dev/null +++ b/excerpt-data/cdaaf7a3f7595d3d/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13001a0fc24aa22f68896e794c26610706c99a84e013581a1c7c32b6c01a4785 +size 63 diff --git a/excerpt-data/ebc07f725c64907d/__index__ b/excerpt-data/ebc07f725c64907d/__index__ new file mode 100644 index 0000000..bc93c8d --- /dev/null +++ b/excerpt-data/ebc07f725c64907d/__index__ @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afedd3471ced3f34cc5be50a98cfcb078f344e7330095ceb5b80ce0849edf48a +size 160540 diff --git a/excerpt-data/ebc07f725c64907d/url.txt b/excerpt-data/ebc07f725c64907d/url.txt new file mode 100644 index 
0000000..46f4133 --- /dev/null +++ b/excerpt-data/ebc07f725c64907d/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b7df3c76ef68022d4de20d77d6e6b3876497864c3842866f60d97660de64d2 +size 56 diff --git a/rules/work/genes/panelapp.smk b/rules/work/genes/panelapp.smk new file mode 100644 index 0000000..e71240e --- /dev/null +++ b/rules/work/genes/panelapp.smk @@ -0,0 +1,43 @@ +## Rules related to the PanelApp entities download + + +import os +import subprocess +import sys +import tempfile + + +rule genes_panelapp_download: # -- download PanelApp entities + output: + jsonl="work/download/genes/panelapp/{date}/panelapp.jsonl", + run: + base_url = "https://panelapp.genomicsengland.co.uk/api/v1" + + pages = [] + page_no = 1 + page_count = None + url = f"{base_url}/entities/" + with tempfile.TemporaryDirectory() as tmpdir: + while url: + print( + f"downloading page {page_no}/{page_count if page_count else '?'}...", + file=sys.stderr, + ) + subprocess.check_call(["wget", "-O", f"{tmpdir}/page.json", url]) + with open(f"{tmpdir}/page.json", "rt") as f: + page = json.load(f) + pages.append(page) + url = page.get("next") + page_no += 1 + if not page_count: + per_page = len(page.get("results", [None])) + page_count = (page.get("count") + per_page - 1) // per_page + if os.environ.get("CI", None) == "true" and page_no > 2: + print("CI mode: only downloading first 2 pages", file=sys.stderr) + break + + os.makedirs(f"work/download/genes/panelapp/{wildcards.date}", exist_ok=True) + with open(output.jsonl, "wt") as f: + for page in pages: + for result in page.get("results", []): + print(json.dumps(result), file=f) diff --git a/varfish_db_downloader/wget.py b/varfish_db_downloader/wget.py index b71a93e..36cb659 100644 --- a/varfish_db_downloader/wget.py +++ b/varfish_db_downloader/wget.py @@ -250,4 +250,7 @@ def copy_excerpt(url: UrlEntry, data_dir: str, output_document: str): basename = parsed.path.split("/")[-1] excerpt_path = in_path / 
basename click.echo(err=True, message="copying {} => {}".format(excerpt_path, output_document)) - shutil.copy(excerpt_path, output_document) + if os.path.isdir(excerpt_path): + shutil.copy(f"{excerpt_path}/__index__", output_document) + else: + shutil.copy(excerpt_path, output_document) From 98204dae7373191efd4259e91e4054b85a70ee61 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 4 Jan 2024 15:40:34 +0100 Subject: [PATCH 2/2] wip --- download_urls.yml | 4 ++-- environment.yml | 2 +- .../GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz | 0 excerpt-data/4c4ffa6ddc180f40/url.txt | 3 +++ .../GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt | 0 excerpt-data/4efb53cbe56f8290/url.txt | 3 +++ excerpt-data/95e8d788836873e9/url.txt | 3 --- excerpt-data/d0a5951ccb4cd824/url.txt | 3 --- rules/output/annonars/genes.smk | 2 ++ rules/work/genes/gtex.smk | 4 ++-- 10 files changed, 13 insertions(+), 11 deletions(-) rename excerpt-data/{95e8d788836873e9 => 4c4ffa6ddc180f40}/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz (100%) create mode 100644 excerpt-data/4c4ffa6ddc180f40/url.txt rename excerpt-data/{d0a5951ccb4cd824 => 4efb53cbe56f8290}/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt (100%) create mode 100644 excerpt-data/4efb53cbe56f8290/url.txt delete mode 100644 excerpt-data/95e8d788836873e9/url.txt delete mode 100644 excerpt-data/d0a5951ccb4cd824/url.txt diff --git a/download_urls.yml b/download_urls.yml index e7a9609..a4f0af8 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -91,12 +91,12 @@ - url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv - url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh38.tsv -- url: https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt +- url: https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt excerpt_strategy: strategy: no-excerpt 
count: null -- url: https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz +- url: https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz excerpt_strategy: strategy: manual count: null diff --git a/environment.yml b/environment.yml index caf11e8..1f42695 100644 --- a/environment.yml +++ b/environment.yml @@ -41,7 +41,7 @@ dependencies: # Parallel (de)compression. - pigz # Varfish related - - annonars =0.32.0 + - annonars =0.33.0 - viguno =0.2.0 - mehari =0.21.1 - varfish-server-worker =0.10.2 diff --git a/excerpt-data/95e8d788836873e9/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz b/excerpt-data/4c4ffa6ddc180f40/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz similarity index 100% rename from excerpt-data/95e8d788836873e9/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz rename to excerpt-data/4c4ffa6ddc180f40/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz diff --git a/excerpt-data/4c4ffa6ddc180f40/url.txt b/excerpt-data/4c4ffa6ddc180f40/url.txt new file mode 100644 index 0000000..f50b3d7 --- /dev/null +++ b/excerpt-data/4c4ffa6ddc180f40/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e96d926c2e9db720d9c852898dadae9e212ee31f5f201ef94a5201ba7f2e0ed +size 120 diff --git a/excerpt-data/d0a5951ccb4cd824/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt b/excerpt-data/4efb53cbe56f8290/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt similarity index 100% rename from excerpt-data/d0a5951ccb4cd824/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt rename to excerpt-data/4efb53cbe56f8290/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt diff --git a/excerpt-data/4efb53cbe56f8290/url.txt b/excerpt-data/4efb53cbe56f8290/url.txt new file mode 100644 index 0000000..adc73c6 --- /dev/null +++ b/excerpt-data/4efb53cbe56f8290/url.txt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ff230dd9fe588feac8473e0c525a99b5a8f4629309b2899a6c5dd64b69f1102c +size 124 diff --git a/excerpt-data/95e8d788836873e9/url.txt b/excerpt-data/95e8d788836873e9/url.txt deleted file mode 100644 index 2bdc79c..0000000 --- a/excerpt-data/95e8d788836873e9/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c0be28ae237143a1b1143b6e129be9e652346c01c4c7f9b924ba0857c247732 -size 122 diff --git a/excerpt-data/d0a5951ccb4cd824/url.txt b/excerpt-data/d0a5951ccb4cd824/url.txt deleted file mode 100644 index 90f8837..0000000 --- a/excerpt-data/d0a5951ccb4cd824/url.txt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d9cc185d363d4d24c084b1e78bbc4c6e21cd06c1147a0b14e1405cf5ef3ffce -size 116 diff --git a/rules/output/annonars/genes.smk b/rules/output/annonars/genes.smk index b0833c3..b461a34 100644 --- a/rules/output/annonars/genes.smk +++ b/rules/output/annonars/genes.smk @@ -12,6 +12,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file ncbi="work/genes/entrez/{date}/gene_info.jsonl", omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv", orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv", + panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl", rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv", shet="work/genes/shet/2019/shet_weghorn_2019.tsv", gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz", @@ -44,6 +45,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file --path-in-hgnc {input.hgnc} \ --path-in-omim {input.omim} \ --path-in-orpha {input.orpha} \ + --path-in-panelapp {input.panelapp} \ --path-in-ncbi {input.ncbi} \ --path-in-rcnv {input.rcnv} \ --path-in-shet {input.shet} \ diff --git a/rules/work/genes/gtex.smk b/rules/work/genes/gtex.smk index c33792e..9457f1a 100644 --- a/rules/work/genes/gtex.smk +++ b/rules/work/genes/gtex.smk @@ -34,11 +34,11 @@ rule genes_gtex_v8_download: # -- 
download GTex v8 gene expression data r""" wget --no-check-certificate \ -O {output.attributes} \ - https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt + https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt wget --no-check-certificate \ -O {output.genes_tpm} \ - https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz + https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz md5sum {output.attributes} > {output.attributes_md5} md5sum {output.genes_tpm} > {output.genes_tpm_md5}