IKIM-Essen · alethomas · Mar 4, 2022 · Feb 21, 2022 · Feb 21, 2022 · Feb 21, 2022
diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml
@@ -82,6 +82,12 @@ preprocessing:
   amplicon-reference: "MN908947"
 
 variant-calling:
+  # genome annotation to use. Can be
+  # 'orf' (e.g. ORF1ab, S, etc..) and/or
+  # 'protein' (e.g. nsp1, Hel, Spike Protein S1, etc..)
+  annotations:
+    - "orf"
+    - "protein"
   # false discovery rate to control for
   fdr: 0.05
   # downsample loci to this read depth

diff --git a/config/config.yaml b/config/config.yaml
@@ -91,6 +91,12 @@ assembly:
   min-variant-prob: 0.95
 
 variant-calling:
+  # genome annotation to use. Can be
+  # 'orf' (e.g. ORF1ab, S, etc..) and/or
+  # 'protein' (e.g. nsp1, Hel, Spike Protein S1, etc..)
+  annotations:
+    - "orf"
+    - "protein"
   # false discovery rate to control for
   fdr: 0.05
   # downsample loci to this read depth

diff --git a/workflow/envs/genometools.yaml b/workflow/envs/genometools.yaml
@@ -0,0 +1,5 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - genometools-genometools =1.6.2
diff --git a/workflow/envs/gff3sort.yaml b/workflow/envs/gff3sort.yaml
@@ -0,0 +1,5 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - gff3sort = 0.1
diff --git a/workflow/envs/ucsc.yaml b/workflow/envs/ucsc.yaml
@@ -0,0 +1,5 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - ucsc-bigbedtobed =377
diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk
@@ -159,8 +159,8 @@ rule filter_chr0:
 rule assembly_polishing_illumina:
     input:
         fasta="results/{date}/contigs/ordered/{sample}.fasta",
-        bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.bcf",
-        bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.bcf.csi",
+        bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.orf.bcf",
+        bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.orf.bcf.csi",
     output:
         report(
             "results/{date}/polishing/bcftools-illumina/{sample}.fasta",

diff --git a/workflow/rules/benchmarking_common.smk b/workflow/rules/benchmarking_common.smk
@@ -21,8 +21,8 @@ def get_test_cases_variant_calls(technology, suffix="", get="path"):
             len(sample_table) == 1
         ), f'Too many sampels are defined with technology "{technology}" for test case {wildcards.test_case}.'
 
-        bcf_path_high = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf{suffix}"
-        bcf_path_low = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.low-impact.bcf{suffix}"
+        bcf_path_high = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf{suffix}"
+        bcf_path_low = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.low-impact.orf.bcf{suffix}"
 
         high_impact = expand(
             bcf_path_high,

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -239,7 +239,7 @@ def get_report_samples(wildcards):
 def get_merge_calls_input(suffix):
     def inner(wildcards):
         return expand(
-            "results/{{date}}/filtered-calls/ref~{{reference}}/{{sample}}.{{clonality}}.{{filter}}.{vartype}.fdr-controlled{suffix}",
+            "results/{{date}}/filtered-calls/ref~{{reference}}/{{sample}}.{{clonality}}.{{filter}}.{vartype}.{{annotation}}.fdr-controlled{suffix}",
             suffix=suffix,
             vartype=VARTYPES,
         )
@@ -602,7 +602,7 @@ def get_target_events(wildcards):
 
 def get_control_fdr_input(wildcards):
     if wildcards.reference == "main":
-        return "results/{date}/filtered-calls/ref~{reference}/{sample}.{filter}.bcf"
+        return "results/{date}/filtered-calls/ref~{reference}/annot~{annotation}/{sample}.{filter}.bcf"
     else:
         # If reference is not main, we are polishing an assembly.
         # Here, there is no need to structural variants or annotation based filtering.
@@ -1590,7 +1590,7 @@ def get_input_by_mode(wildcard):
             mode=["major", "any"],
         ),
         zip_expand(
-            "results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf",
+            "results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.orf.bcf",
             zip_wildcard_1=get_dates(),
             zip_wildcard_2=get_samples(),
             expand_wildcard=config["variant-calling"]["filters"],
@@ -1642,10 +1642,25 @@ def get_pangolin_for_report(wildcards):
     return paths
 
 
+def get_genome_annotation(suffix=""):
+    def inner(wildcards):
+        if wildcards.annotation == "orf":
+            return f"resources/annotation.gff.gz{suffix}"
+        elif wildcards.annotation == "protein":
+            return f"resources/protein_products.gff.gz{suffix}"
+
+        raise AttributeError(
+            f"Config for annotation '{wildcards.annotation}' not recognizied. Can be either 'orf' or 'protein'."
+        )
+
+    return inner
+
+
 wildcard_constraints:
     sample="[^/.]+",
     vartype="|".join(VARTYPES),
     clonality="subclonal|clonal",
+    annotation="orf|protein",
     filter="|".join(
         list(map(re.escape, config["variant-calling"]["filters"])) + ["nofilter"]
     ),

diff --git a/workflow/rules/generate_output.smk b/workflow/rules/generate_output.smk
@@ -141,7 +141,7 @@ rule overview_table_patient_csv:
         kraken=get_kraken_output,
         pangolin=get_pangolin_for_report,
         bcf=expand_samples_for_date(
-            "results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
+            "results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf",
         ),
         # Added because WorkflowError: Rule parameter depends on checkpoint but checkpoint output is not defined
         # as input file for the rule. Please add the output of the respective checkpoint to the rule inputs.
@@ -172,7 +172,7 @@ use rule overview_table_patient_csv as overview_table_environment_csv with:
             "results/{{date}}/tables/read_pair_counts/{sample}.txt",
         ),
         bcf=expand_samples_for_date(
-            "results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
+            "results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf",
         ),
     output:
         qc_data="results/{date}/tables/environment-overview.csv",
@@ -238,7 +238,7 @@ rule filter_overview_html:
             htmlindex="index.html",
             caption="../report/filter-overview.rst",
             category="4. Sequences",
-            subcategory="0. Quality Overview",
+            subcategory=" Quality Overview",
         ),
     params:
         pin_until="Sample",
@@ -279,13 +279,13 @@ rule plot_lineages_over_time:
 rule plot_variants_over_time:
     input:
         bcf=lambda wildcards: expand(
-            "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
+            "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf",
             zip,
             date=get_dates_before_date(wildcards),
             sample=get_samples_before_date(wildcards),
         ),
         csi=lambda wildcards: expand(
-            "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf.csi",
+            "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf.csi",
             zip,
             date=get_dates_before_date(wildcards),
             sample=get_samples_before_date(wildcards),
@@ -338,7 +338,7 @@ rule pangolin_call_overview_html:
             htmlindex="index.html",
             caption="../report/pangolin-call-overview.rst",
             category="4. Sequences",
-            subcategory="0. Quality Overview",
+            subcategory=" Quality Overview",
         ),
     params:
         pin_until="Sample",
@@ -372,9 +372,10 @@ rule snakemake_reports_patient:
             ["results/{{date}}/lineage-variant-report/{sample}.lineage-variants"]
         ),
         lambda wildcards: expand(
-            "results/{{date}}/vcf-report/{target}.{filter}",
+            "results/{{date}}/vcf-report/{target}.{filter}.{annotation}",
             target=get_samples_for_date(wildcards.date) + ["all"],
             filter=config["variant-calling"]["filters"],
+            annotation=config["variant-calling"]["annotations"],
         ),
         # 3. Sequencing Details
         "results/{date}/qc/laboratory/multiqc.html",
@@ -394,8 +395,9 @@ rule snakemake_reports_patient:
         ),
         # 5. Variant Call Files
         expand(
-            ["results/{{date}}/ucsc-vcfs/all.{{date}}.{filter}.vcf"],
+            ["results/{{date}}/ucsc-vcfs/all.{{date}}.{filter}.{annotation}.vcf"],
             filter=config["variant-calling"]["filters"],
+            annotation=config["variant-calling"]["annotations"],
         ),
         # 6. High Quality Genomes
         "results/high-quality-genomes/{date}.fasta",
@@ -430,9 +432,10 @@ use rule snakemake_reports_patient as snakemake_reports_environment with:
             ["results/{{date}}/lineage-variant-report/{sample}.lineage-variants"]
         ),
         lambda wildcards: expand(
-            "results/{{date}}/vcf-report/{target}.{filter}",
+            "results/{{date}}/vcf-report/{target}.{filter}.{annotation}",
             target=get_samples_for_date(wildcards.date) + ["all"],
             filter=config["variant-calling"]["filters"],
+            annotation=config["variant-calling"]["annotations"],
         ),
         # 3. Sequencing Details
         "results/{date}/qc/laboratory/multiqc.html",

diff --git a/workflow/rules/lineage_variant_calling.smk b/workflow/rules/lineage_variant_calling.smk
@@ -59,7 +59,7 @@ use rule overview_table_html as generate_lineage_variant_report with:
             htmlindex="index.html",
             caption="../report/lineage-variant-report.rst",
             category="2. Variant Call Details",
-            subcategory="1. VOC Similarity",
+            subcategory=" VOC Similarity",
         ),
     log:
         "logs/{date}/lineage-variant-report/{sample}.log",

diff --git a/workflow/rules/long_read.smk b/workflow/rules/long_read.smk
@@ -149,8 +149,8 @@ use rule assembly_polishing_ont as medaka_consensus_reference with:
 rule bcftools_consensus_ont:
     input:
         fasta="results/{date}/consensus/medaka/{sample}/{sample}.fasta",
-        bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf",  # clonal vs. subclonal?
-        bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi",
+        bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf",  # clonal vs. subclonal?
+        bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf.csi",
     output:
         temp("results/{date}/consensus/bcftools/{sample}.fasta"),
     log:

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
@@ -48,6 +48,88 @@ rule get_genome_annotation:
         "zcat | grep -v '#' | sort -k1,1 -k4,4n -k5,5n -t$'\t' | bgzip -c > {output}) 2> {log}"
 
 
+rule download_protein_products:
+    output:
+        temp("resources/protein_products.bed"),
+    log:
+        "logs/download_protein_products.log",
+    conda:
+        "../envs/ucsc.yaml"
+    shell:
+        "(bigBedToBed http://hgdownload.soe.ucsc.edu/gbdb/wuhCor1/uniprot/unipChainCov2.bb"
+        " -chrom=NC_045512v2 -start=0 -end=29903 {output})"
+        "2>{log}"
+
+
+rule bed2gff:
+    input:
+        "resources/protein_products.bed",
+    output:
+        temp("resources/protein_products.gff"),
+    log:
+        "logs/bed2gff3.log",
+    conda:
+        "../envs/genometools.yaml"
+    shell:
+        "(cut -f1-12 {input} | sed -e 's/ /-/g' | sed -e 's/NC_045512v2/NC_045512.2/g'"
+        " | gt bed_to_gff3 -featuretype gene -thicktype transcript -blocktype CDS -o {output} -force /dev/stdin )"
+        "2> {log}"
+
+
+rule filter_gff:
+    input:
+        "resources/protein_products.gff",
+    output:
+        temp("resources/protein_products.formatted.gff"),
+    log:
+        "logs/format_gff.log",
+    conda:
+        "../envs/tabix.yaml"
+    shell:
+        # download, sort and bgzip gff (see https://www.ensembl.org/info/docs/tools/vep/script/vep_custom.html)
+        "cat {input} | grep -v '#' > {output} 2> {log}"
+
+
+rule fix_gff:
+    input:
+        "resources/protein_products.formatted.gff",
+    output:
+        temp("resources/protein_products.fixed.gff"),
+    log:
+        "logs/fix_gff.log",
+    conda:
+        "../envs/python.yaml"
+    script:
+        "../scripts/fix-protein-gff.py"
+
+
+rule format_fixed_gff:
+    input:
+        "resources/protein_products.fixed.gff",
+    output:
+        temp("resources/protein_products.fixed.formatted.gff"),
+    log:
+        "logs/format_gff.log",
+    conda:
+        "../envs/tabix.yaml"
+    shell:
+        # download, sort and bgzip gff (see https://www.ensembl.org/info/docs/tools/vep/script/vep_custom.html)
+        "cat {input} | grep -v '#' | sort -k1,1 -k4,4n -k5,5n -k3,3n -t$'\t' > {output} 2> {log}"
+
+
+rule compress_gff:
+    input:
+        "resources/protein_products.fixed.formatted.gff",
+    output:
+        "resources/protein_products.gff.gz",
+    log:
+        "logs/compress_gff.log",
+    conda:
+        "../envs/tabix.yaml"
+    shell:
+        "bgzip -c {input} > {output} 2> {log}"
+
+
 rule get_genome_annotation_for_known_variants:
     output:
         "resources/annotation_known_variants.gff.gz",

diff --git a/workflow/rules/variant_annotation.smk b/workflow/rules/variant_annotation.smk
@@ -21,19 +21,19 @@ rule annotate_variants:
         plugins="resources/vep/plugins",
         fasta="resources/genomes/main.fasta",
         fai="resources/genomes/main.fasta.fai",
-        gff="resources/annotation.gff.gz",
-        csi="resources/annotation.gff.gz.tbi",
+        gff=get_genome_annotation(),
+        csi=get_genome_annotation(suffix=".tbi"),
         problematic="resources/problematic-sites.vcf.gz",
         problematic_tbi="resources/problematic-sites.vcf.gz.tbi",
     output:
-        calls="results/{date}/annotated-calls/ref~main/{sample}.bcf",
-        stats="results/{date}/annotated-calls/ref~main/{sample}.html",
+        calls="results/{date}/annotated-calls/ref~main/annot~{annotation}/{sample}.bcf",
+        stats="results/{date}/annotated-calls/ref~main/annot~{annotation}/{sample}.html",
     params:
         # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
         # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
         plugins=["LoFtool"],
         extra=get_vep_args,
     log:
-        "logs/{date}/vep/{sample}.log",
+        "logs/{date}/vep/{annotation}/{sample}.log",
     wrapper:
         "0.72.0/bio/vep/annotate"
diff --git a/workflow/rules/variant_filtration.smk b/workflow/rules/variant_filtration.smk
@@ -6,14 +6,16 @@
 
 rule vembrane_filter:
     input:
-        "results/{date}/annotated-calls/ref~main/{sample}.bcf",
+        "results/{date}/annotated-calls/ref~main/annot~{annotation}/{sample}.bcf",
     output:
-        temp("results/{date}/filtered-calls/ref~main/{sample}.{filter}.bcf"),
+        temp(
+            "results/{date}/filtered-calls/ref~main/annot~{annotation}/{sample}.{filter}.bcf"
+        ),
     params:
         expression=get_vembrane_expression,
         extra="",
     log:
-        "logs/{date}/vembrane/{sample}.{filter}.log",
+        "logs/{date}/vembrane/{sample}.{filter}.{annotation}.log",
     wrapper:
         "0.71.1/bio/vembrane/filter"
 
@@ -23,13 +25,13 @@ rule control_fdr:
         get_control_fdr_input,
     output:
         temp(
-            "results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.fdr-controlled.bcf"
+            "results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.{annotation}.fdr-controlled.bcf"
         ),
     params:
         fdr=config["variant-calling"]["fdr"],
         events=get_target_events,
     log:
-        "logs/{date}/control-fdr/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.log",
+        "logs/{date}/control-fdr/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.{annotation}.log",
     conda:
         "../envs/varlociraptor.yaml"
     shell:
@@ -42,9 +44,9 @@ rule merge_calls:
         calls=get_merge_calls_input(".bcf"),
         idx=get_merge_calls_input(".bcf.csi"),
     output:
-        "results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.bcf",
+        "results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.{annotation}.bcf",
     log:
-        "logs/{date}/merge-calls/ref~{reference}/{sample}.{clonality}.{filter}.log",
+        "logs/{date}/merge-calls/ref~{reference}/{sample}.{clonality}.{filter}.{annotation}.log",
     params:
         "-a -Ob",
     wrapper: