Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: integrating PanelApp download for annonars (#79) #80

Merged
merged 2 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ include: "rules/work/genes/hgnc.smk"
include: "rules/work/genes/mehari_data_tx.smk"
include: "rules/work/genes/ncbi.smk"
include: "rules/work/genes/omim.smk"
include: "rules/work/genes/panelapp.smk"
include: "rules/work/genes/orphapacket.smk"
include: "rules/work/genes/rcnv.smk"
include: "rules/work/genes/shet.smk"
Expand Down
17 changes: 15 additions & 2 deletions download_urls.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/
excerpt_strategy:
strategy: no-excerpt
count: null
- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/?page=2
excerpt_strategy:
strategy: no-excerpt
count: null
- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/?page=3
excerpt_strategy:
strategy: no-excerpt
count: null

# dbNSFP v4.5a
- url: https://dbnsfp.s3.amazonaws.com/dbNSFP4.5a.zip
excerpt_strategy:
Expand Down Expand Up @@ -78,12 +91,12 @@
- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv
- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh38.tsv

- url: https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
- url: https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
excerpt_strategy:
strategy: no-excerpt
count: null

- url: https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
- url: https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
excerpt_strategy:
strategy: manual
count: null
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies:
# Parallel (de)compression.
- pigz
# Varfish related
- annonars =0.32.0
- annonars =0.33.0
- viguno =0.2.0
- mehari =0.21.1
- varfish-server-worker =0.10.2
Expand Down
3 changes: 3 additions & 0 deletions excerpt-data/4c4ffa6ddc180f40/url.txt
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/4efb53cbe56f8290/url.txt
Git LFS file not shown
3 changes: 0 additions & 3 deletions excerpt-data/95e8d788836873e9/url.txt

This file was deleted.

3 changes: 3 additions & 0 deletions excerpt-data/c9c7d6df0e24b954/__index__
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/c9c7d6df0e24b954/url.txt
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/cdaaf7a3f7595d3d/__index__
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/cdaaf7a3f7595d3d/url.txt
Git LFS file not shown
3 changes: 0 additions & 3 deletions excerpt-data/d0a5951ccb4cd824/url.txt

This file was deleted.

3 changes: 3 additions & 0 deletions excerpt-data/ebc07f725c64907d/__index__
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/ebc07f725c64907d/url.txt
Git LFS file not shown
2 changes: 2 additions & 0 deletions rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
ncbi="work/genes/entrez/{date}/gene_info.jsonl",
omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv",
orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv",
panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl",
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
Expand Down Expand Up @@ -44,6 +45,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
--path-in-hgnc {input.hgnc} \
--path-in-omim {input.omim} \
--path-in-orpha {input.orpha} \
--path-in-panelapp {input.panelapp} \
--path-in-ncbi {input.ncbi} \
--path-in-rcnv {input.rcnv} \
--path-in-shet {input.shet} \
Expand Down
4 changes: 2 additions & 2 deletions rules/work/genes/gtex.smk
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ rule genes_gtex_v8_download: # -- download GTex v8 gene expression data
r"""
wget --no-check-certificate \
-O {output.attributes} \
https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt

wget --no-check-certificate \
-O {output.genes_tpm} \
https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz

md5sum {output.attributes} > {output.attributes_md5}
md5sum {output.genes_tpm} > {output.genes_tpm_md5}
Expand Down
43 changes: 43 additions & 0 deletions rules/work/genes/panelapp.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
## Rules related to the Genomics England PanelApp download


import os
import subprocess
import sys
import tempfile


rule genes_panelapp_download:  # -- download PanelApp entities as JSONL
    # Walks the paginated PanelApp ``/entities/`` endpoint by following each
    # page's ``next`` link and writes every entry of every page's ``results``
    # list as one JSON document per line.
    #
    # When the environment variable ``CI`` equals ``"true"``, only the first
    # two pages are fetched.
    output:
        jsonl="work/download/genes/panelapp/{date}/panelapp.jsonl",
    run:
        # Imported locally so the rule does not depend on ``json`` happening
        # to be available in the workflow's global namespace.
        import json

        base_url = "https://panelapp.genomicsengland.co.uk/api/v1"

        pages = []
        page_no = 1
        page_count = None  # unknown until a page reports usable count/results
        url = f"{base_url}/entities/"
        with tempfile.TemporaryDirectory() as tmpdir:
            page_path = f"{tmpdir}/page.json"
            while url:
                print(
                    f"downloading page {page_no}/{page_count if page_count else '?'}...",
                    file=sys.stderr,
                )
                subprocess.check_call(["wget", "-O", page_path, url])
                with open(page_path, "rt", encoding="utf-8") as f:
                    page = json.load(f)
                pages.append(page)
                # A missing or null ``next`` ends the loop.
                url = page.get("next")
                page_no += 1
                if page_count is None:
                    # The page count is only used for the progress message, so
                    # an empty page or a missing ``count`` must not abort the
                    # download (avoids ZeroDivisionError / TypeError).
                    per_page = len(page.get("results") or [])
                    total = page.get("count")
                    if per_page and total is not None:
                        page_count = (total + per_page - 1) // per_page
                if os.environ.get("CI", None) == "true" and page_no > 2:
                    print("CI mode: only downloading first 2 pages", file=sys.stderr)
                    break

        os.makedirs(f"work/download/genes/panelapp/{wildcards.date}", exist_ok=True)
        # Written only after all pages were fetched, so a failed download never
        # leaves a partially written output file behind.
        with open(output.jsonl, "wt", encoding="utf-8") as f:
            for page in pages:
                for result in page.get("results") or []:
                    print(json.dumps(result), file=f)
5 changes: 4 additions & 1 deletion varfish_db_downloader/wget.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,4 +250,7 @@ def copy_excerpt(url: UrlEntry, data_dir: str, output_document: str):
basename = parsed.path.split("/")[-1]
excerpt_path = in_path / basename
click.echo(err=True, message="copying {} => {}".format(excerpt_path, output_document))
shutil.copy(excerpt_path, output_document)
if os.path.isdir(excerpt_path):
shutil.copy(f"{excerpt_path}/__index__", output_document)
else:
shutil.copy(excerpt_path, output_document)
Loading