From 664295d8db03130bb0a3b52520f17b166874e40b Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 22 Nov 2023 09:07:13 +0100 Subject: [PATCH 1/2] feat: build annonars regions (clingen dosage) (#67) --- Snakefile | 7 ++ download_urls.yml | 9 +++ ...GCF_000001405.40_GRCh38.p14_genomic.gff.gz | 3 + excerpt-data/98935d27cc8f0dc0/url.txt | 3 + ...GCF_000001405.25_GRCh37.p13_genomic.gff.gz | 3 + excerpt-data/f0ed4b0862f1b46b/url.txt | 3 + rules/output/annonars/functional.smk | 67 +++++++++++++++++++ rules/output/annonars/functional.spec.yaml | 16 +++++ rules/output/annonars/regions.smk | 52 ++++++++++++++ rules/output/annonars/regions.spec.yaml | 14 ++++ varfish_db_downloader/versions.py | 6 ++ 11 files changed, 183 insertions(+) create mode 100644 excerpt-data/98935d27cc8f0dc0/GCF_000001405.40_GRCh38.p14_genomic.gff.gz create mode 100644 excerpt-data/98935d27cc8f0dc0/url.txt create mode 100644 excerpt-data/f0ed4b0862f1b46b/GCF_000001405.25_GRCh37.p13_genomic.gff.gz create mode 100644 excerpt-data/f0ed4b0862f1b46b/url.txt create mode 100644 rules/output/annonars/functional.smk create mode 100644 rules/output/annonars/functional.spec.yaml create mode 100644 rules/output/annonars/regions.smk create mode 100644 rules/output/annonars/regions.spec.yaml diff --git a/Snakefile b/Snakefile index 8e4e82a..c1ea664 100644 --- a/Snakefile +++ b/Snakefile @@ -165,6 +165,11 @@ rule all: f"output/full/annonars/gnomad-sv-exomes-grch38-{DV.gnomad_cnv4}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/gnomad-sv-genomes-grch37-{DV.gnomad_sv}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/gnomad-sv-genomes-grch38-{DV.gnomad_sv4}+{PV.annonars}/rocksdb/IDENTITY", + # ----- sequence annotation + f"output/full/annonars/functional-grch37-{DV.refseq_fe_37}+{PV.annonars}/rocksdb/IDENTITY", + f"output/full/annonars/functional-grch38-{DV.refseq_fe_38}+{PV.annonars}/rocksdb/IDENTITY", + f"output/full/annonars/regions-grch37-{DV.today}+{PV.annonars}/rocksdb/IDENTITY", + 
f"output/full/annonars/regions-grch38-{DV.today}+{PV.annonars}/rocksdb/IDENTITY", # ----- conservation f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY", f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY", @@ -386,6 +391,8 @@ include: "rules/output/annonars/gnomad_mtdna.smk" include: "rules/output/annonars/gnomad_sv.smk" include: "rules/output/annonars/helix.smk" include: "rules/output/annonars/genes.smk" +include: "rules/output/annonars/functional.smk" +include: "rules/output/annonars/regions.smk" # ---- worker include: "rules/output/worker/patho_mms.smk" include: "rules/output/worker/clinvar.smk" diff --git a/download_urls.yml b/download_urls.yml index 687cd9b..33914fa 100644 --- a/download_urls.yml +++ b/download_urls.yml @@ -1,3 +1,12 @@ +- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20201022/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz + excerpt_strategy: + strategy: gz-head + count: 1000 +- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz + excerpt_strategy: + strategy: gz-head + count: 1000 + - url: https://storage.googleapis.com/gcp-public-data--gnomad/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz excerpt_strategy: strategy: gz-head diff --git a/excerpt-data/98935d27cc8f0dc0/GCF_000001405.40_GRCh38.p14_genomic.gff.gz b/excerpt-data/98935d27cc8f0dc0/GCF_000001405.40_GRCh38.p14_genomic.gff.gz new file mode 100644 index 0000000..f2874f4 --- /dev/null +++ b/excerpt-data/98935d27cc8f0dc0/GCF_000001405.40_GRCh38.p14_genomic.gff.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c9fcd2bed4045002e27b5f8b36be6d798f21e63f277d133df49c1466b0c0680 +size 20157 diff --git a/excerpt-data/98935d27cc8f0dc0/url.txt b/excerpt-data/98935d27cc8f0dc0/url.txt new file mode 100644 index 0000000..0b7c3d8 --- /dev/null +++ 
b/excerpt-data/98935d27cc8f0dc0/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:113a71bbc89339505e4d34c739662ce4e11ce40c7773607aea55719aa095e49e +size 141 diff --git a/excerpt-data/f0ed4b0862f1b46b/GCF_000001405.25_GRCh37.p13_genomic.gff.gz b/excerpt-data/f0ed4b0862f1b46b/GCF_000001405.25_GRCh37.p13_genomic.gff.gz new file mode 100644 index 0000000..1b43d3a --- /dev/null +++ b/excerpt-data/f0ed4b0862f1b46b/GCF_000001405.25_GRCh37.p13_genomic.gff.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c6578cae062849e5aa1d59c8a9a0e3b1bbd0173b7d0700eb7f22486154b7f7 +size 18765 diff --git a/excerpt-data/f0ed4b0862f1b46b/url.txt b/excerpt-data/f0ed4b0862f1b46b/url.txt new file mode 100644 index 0000000..b9b3a72 --- /dev/null +++ b/excerpt-data/f0ed4b0862f1b46b/url.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa3523b242c4f92e346384bf1855bf53417f48d0b9814d9a8bf7891cbbcfebe7 +size 150 diff --git a/rules/output/annonars/functional.smk b/rules/output/annonars/functional.smk new file mode 100644 index 0000000..c4a6d13 --- /dev/null +++ b/rules/output/annonars/functional.smk @@ -0,0 +1,67 @@ +## Rules to build the annonars functional annotation database.
+
+
+rule work_annonars_functional_download_37:  # -- download functional data for GRCh37
+    output:
+        "work/download/refseq/grch37/{version}/{assembly}_genomic.gff.gz",
+    shell:
+        r"""
+        wget -O {output} \
+            https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
+        """
+
+
+rule work_annonars_functional_download_38:  # -- download functional data for GRCh38
+    output:
+        "work/download/refseq/grch38/{version}/{assembly}_genomic.gff.gz",
+    shell:
+        r"""
+        wget -O {output} \
+            https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
+        """
+
+
+def output_annonars_functional_input(wildcards):  # -- input function: pick the downloaded RefSeq GFF for the genome release
+    if wildcards.genome_release == "grch37":
+        return f"work/download/refseq/grch37/{DV.refseq_fe_37}/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
+    else:
+        return f"work/download/refseq/grch38/{DV.refseq_fe_38}/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
+
+
+rule output_annonars_functional:  # -- build annonars functional RocksDB file
+    input:
+        output_annonars_functional_input,
+    output:
+        rocksdb_identity=(
+            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/"
+            "rocksdb/IDENTITY"
+        ),
+        spec_yaml=(
+            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/spec.yaml"
+        ),
+    wildcard_constraints:
+        v_refseq=RE_VERSION,
+        v_annonars=RE_VERSION,
+    shell:
+        r"""
+        export TMPDIR=$(mktemp -d)
+        trap "rm -rf $TMPDIR" EXIT
+        # Keep only header lines (leading #) and records mentioning RefSeqFE.
+        zgrep '^#\|RefSeqFE' {input} > $TMPDIR/tmp.gff
+
+        annonars functional import -vvv \
+            --genome-release {wildcards.genome_release} \
+            --path-in-gff $TMPDIR/tmp.gff \
+            --path-out-rocksdb $(dirname {output.rocksdb_identity})
+
+        varfish-db-downloader tpl \
+            --template rules/output/annonars/functional.spec.yaml \
+            --value today={TODAY} \
+            --value genome_release={wildcards.genome_release} \
+            --value version={wildcards.v_refseq}+{wildcards.v_annonars} \
+            --value v_refseq={wildcards.v_refseq} \
+            \
+            --value v_annonars={wildcards.v_annonars} \
+            --value v_downloader={PV.downloader} \
+        > {output.spec_yaml}
+        """
diff --git a/rules/output/annonars/functional.spec.yaml b/rules/output/annonars/functional.spec.yaml
new file mode 100644
index 0000000..a6dafc5
--- /dev/null
+++ b/rules/output/annonars/functional.spec.yaml
@@ -0,0 +1,16 @@
+dc.identifier: annonars/functional:{{ version }}-{{ genome_release }}
+dc.title: annonars functional elements RocksDB
+dc.creator: VarFish Developer Teams
+dc.format: application/x-rocksdb
+dc.date: {{ today }}
+x-version: {{ version }}
+x-genome-release: {{ genome_release }}
+dc.description: |
+  RocksDB built from RefSeq Functional Elements (and other sources in
+  the future).
+dc.source:
+  - PMID:34876495
+  - https://www.ncbi.nlm.nih.gov/refseq/
+x-created-from:
+  - name: RefSeq Functional Elements
+    version: {{ v_refseq }}
diff --git a/rules/output/annonars/regions.smk b/rules/output/annonars/regions.smk
new file mode 100644
index 0000000..1084571
--- /dev/null
+++ b/rules/output/annonars/regions.smk
@@ -0,0 +1,52 @@
+## Rules to build the annonars regions annotation database.
+
+
+rule work_annonars_regions_download:  # -- download clingen regions
+    output:
+        "work/download/clingen/{genome_release}/{today}/ClinGen_region_curation_list_{genome_release}.tsv",
+    shell:
+        r"""
+        if [[ "{wildcards.genome_release}" == "grch37" ]]; then
+            GENOME=GRCh37
+        else
+            GENOME=GRCh38
+        fi
+        # Fetch the curation list for the matching release from the ClinGen FTP server.
+        wget -O {output} \
+            ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_$GENOME.tsv
+        """
+
+
+rule output_annonars_regions:  # -- build annonars regions RocksDB file
+    input:
+        "work/download/clingen/{genome_release}/{date}/ClinGen_region_curation_list_{genome_release}.tsv",
+    output:
+        rocksdb_identity=(
+            "output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/" "rocksdb/IDENTITY"
+        ),
+        spec_yaml=("output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/spec.yaml"),
+    wildcard_constraints:
+        v_refseq=RE_VERSION,
+        v_annonars=RE_VERSION,
+    shell:
+        r"""
+        if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then
+            >&2 echo "{wildcards.date} is not today"
+            exit 1
+        fi
+
+        annonars regions import -vvv \
+            --genome-release {wildcards.genome_release} \
+            --path-in-clingen {input} \
+            --path-out-rocksdb $(dirname {output.rocksdb_identity})
+
+        varfish-db-downloader tpl \
+            --template rules/output/annonars/regions.spec.yaml \
+            --value today={TODAY} \
+            --value genome_release={wildcards.genome_release} \
+            --value version={wildcards.date}+{wildcards.v_annonars} \
+            \
+            --value v_annonars={wildcards.v_annonars} \
+            --value v_downloader={PV.downloader} \
+        > {output.spec_yaml}
+        """
diff --git a/rules/output/annonars/regions.spec.yaml b/rules/output/annonars/regions.spec.yaml
new file mode 100644
index 0000000..88e5206
--- /dev/null
+++ b/rules/output/annonars/regions.spec.yaml
@@ -0,0 +1,14 @@
+dc.identifier: annonars/regions:{{ version }}-{{ genome_release }}
+dc.title: annonars regions annotation RocksDB
+dc.creator: VarFish Developer Teams
+dc.format: application/x-rocksdb
+dc.date: {{ today }}
+x-version: {{ version }}
+x-genome-release: {{ genome_release
}} +dc.description: | + RocksDB with region annotation. +dc.source: + - https://search.clinicalgenome.org/kb/gene-dosage +x-created-from: + - name: ClinGen Region Dosage Pathogenicity + version: {{ today }} diff --git a/varfish_db_downloader/versions.py b/varfish_db_downloader/versions.py index 85fc916..28b37eb 100644 --- a/varfish_db_downloader/versions.py +++ b/varfish_db_downloader/versions.py @@ -111,6 +111,10 @@ class DataVersions: #: Marker file for the tracks version. This allows us to update the #: tracks BED files later on. tracks: str + #: RefSeq functional elements for GRCh37. + refseq_fe_37: str + #: RefSeq functional elements for GRCh38. + refseq_fe_38: str #: The data versions to use. @@ -158,6 +162,8 @@ class DataVersions: clinvar_release=CLINVAR_RELEASE, clinvar_version=CLINVAR_VERSION, tracks="0", + refseq_fe_37="105.20201022", + refseq_fe_38="110", ) From f00db04286b6406346434be2fee4485740f84a22 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 22 Nov 2023 09:08:59 +0100 Subject: [PATCH 2/2] feat: adding annonars functional (#68)