Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add variant call details based on protein annotation #487

Merged
merged 13 commits into from
Mar 4, 2022
6 changes: 6 additions & 0 deletions .tests/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ preprocessing:
amplicon-reference: "MN908947"

variant-calling:
# genome annotation to use. Can be
# 'orf' (e.g. ORF1ab, S, etc..) and/or
# 'protein' (e.g. nsp1, Hel, Spike Protein S1, etc..)
annotations:
- "orf"
- "protein"
# false discovery rate to control for
fdr: 0.05
# downsample loci to this read depth
Expand Down
6 changes: 6 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ assembly:
min-variant-prob: 0.95

variant-calling:
# genome annotation to use. Can be
# 'orf' (e.g. ORF1ab, S, etc..) and/or
# 'protein' (e.g. nsp1, Hel, Spike Protein S1, etc..)
annotations:
- "orf"
- "protein"
# false discovery rate to control for
fdr: 0.05
# downsample loci to this read depth
Expand Down
5 changes: 5 additions & 0 deletions workflow/envs/genometools.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- bioconda
- conda-forge
dependencies:
- genometools-genometools =1.6.2
5 changes: 5 additions & 0 deletions workflow/envs/gff3sort.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- bioconda
- conda-forge
dependencies:
- gff3sort = 0.1
5 changes: 5 additions & 0 deletions workflow/envs/ucsc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- bioconda
- conda-forge
dependencies:
- ucsc-bigbedtobed =377
4 changes: 2 additions & 2 deletions workflow/rules/assembly.smk
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ rule filter_chr0:
rule assembly_polishing_illumina:
input:
fasta="results/{date}/contigs/ordered/{sample}.fasta",
bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.bcf",
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.bcf.csi",
bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.orf.bcf",
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.clonal.nofilter.orf.bcf.csi",
output:
report(
"results/{date}/polishing/bcftools-illumina/{sample}.fasta",
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/benchmarking_common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def get_test_cases_variant_calls(technology, suffix="", get="path"):
len(sample_table) == 1
), f'Too many sampels are defined with technology "{technology}" for test case {wildcards.test_case}.'

bcf_path_high = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf{suffix}"
bcf_path_low = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.low-impact.bcf{suffix}"
bcf_path_high = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf{suffix}"
bcf_path_low = "results/{date}/filtered-calls/ref~main/{sample}.subclonal.low-impact.orf.bcf{suffix}"

high_impact = expand(
bcf_path_high,
Expand Down
21 changes: 18 additions & 3 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def get_report_samples(wildcards):
def get_merge_calls_input(suffix):
def inner(wildcards):
return expand(
"results/{{date}}/filtered-calls/ref~{{reference}}/{{sample}}.{{clonality}}.{{filter}}.{vartype}.fdr-controlled{suffix}",
"results/{{date}}/filtered-calls/ref~{{reference}}/{{sample}}.{{clonality}}.{{filter}}.{vartype}.{{annotation}}.fdr-controlled{suffix}",
suffix=suffix,
vartype=VARTYPES,
)
Expand Down Expand Up @@ -602,7 +602,7 @@ def get_target_events(wildcards):

def get_control_fdr_input(wildcards):
if wildcards.reference == "main":
return "results/{date}/filtered-calls/ref~{reference}/{sample}.{filter}.bcf"
return "results/{date}/filtered-calls/ref~{reference}/annot~{annotation}/{sample}.{filter}.bcf"
else:
# If reference is not main, we are polishing an assembly.
# Here, there is no need to structural variants or annotation based filtering.
Expand Down Expand Up @@ -1590,7 +1590,7 @@ def get_input_by_mode(wildcard):
mode=["major", "any"],
),
zip_expand(
"results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.bcf",
"results/{zip1}/filtered-calls/ref~main/{zip2}.subclonal.{{exp}}.orf.bcf",
zip_wildcard_1=get_dates(),
zip_wildcard_2=get_samples(),
expand_wildcard=config["variant-calling"]["filters"],
Expand Down Expand Up @@ -1642,10 +1642,25 @@ def get_pangolin_for_report(wildcards):
return paths


def get_genome_annotation(suffix=""):
def inner(wildcards):
if wildcards.annotation == "orf":
return f"resources/annotation.gff.gz{suffix}"
elif wildcards.annotation == "protein":
return f"resources/protein_products.gff.gz{suffix}"

raise AttributeError(
f"Config for annotation '{wildcards.annotation}' not recognizied. Can be either 'orf' or 'protein'."
)

return inner


wildcard_constraints:
sample="[^/.]+",
vartype="|".join(VARTYPES),
clonality="subclonal|clonal",
annotation="orf|protein",
filter="|".join(
list(map(re.escape, config["variant-calling"]["filters"])) + ["nofilter"]
),
Expand Down
21 changes: 12 additions & 9 deletions workflow/rules/generate_output.smk
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ rule overview_table_patient_csv:
kraken=get_kraken_output,
pangolin=get_pangolin_for_report,
bcf=expand_samples_for_date(
"results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
"results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf",
),
# Added because WorkflowError: Rule parameter depends on checkpoint but checkpoint output is not defined
# as input file for the rule. Please add the output of the respective checkpoint to the rule inputs.
Expand Down Expand Up @@ -172,7 +172,7 @@ use rule overview_table_patient_csv as overview_table_environment_csv with:
"results/{{date}}/tables/read_pair_counts/{sample}.txt",
),
bcf=expand_samples_for_date(
"results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
"results/{{date}}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf",
),
output:
qc_data="results/{date}/tables/environment-overview.csv",
Expand Down Expand Up @@ -238,7 +238,7 @@ rule filter_overview_html:
htmlindex="index.html",
caption="../report/filter-overview.rst",
category="4. Sequences",
subcategory="0. Quality Overview",
subcategory=" Quality Overview",
),
params:
pin_until="Sample",
Expand Down Expand Up @@ -279,13 +279,13 @@ rule plot_lineages_over_time:
rule plot_variants_over_time:
input:
bcf=lambda wildcards: expand(
"results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf",
"results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf",
zip,
date=get_dates_before_date(wildcards),
sample=get_samples_before_date(wildcards),
),
csi=lambda wildcards: expand(
"results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.bcf.csi",
"results/{date}/filtered-calls/ref~main/{sample}.subclonal.high+moderate-impact.orf.bcf.csi",
zip,
date=get_dates_before_date(wildcards),
sample=get_samples_before_date(wildcards),
Expand Down Expand Up @@ -338,7 +338,7 @@ rule pangolin_call_overview_html:
htmlindex="index.html",
caption="../report/pangolin-call-overview.rst",
category="4. Sequences",
subcategory="0. Quality Overview",
subcategory=" Quality Overview",
),
params:
pin_until="Sample",
Expand Down Expand Up @@ -372,9 +372,10 @@ rule snakemake_reports_patient:
["results/{{date}}/lineage-variant-report/{sample}.lineage-variants"]
),
lambda wildcards: expand(
"results/{{date}}/vcf-report/{target}.{filter}",
"results/{{date}}/vcf-report/{target}.{filter}.{annotation}",
target=get_samples_for_date(wildcards.date) + ["all"],
filter=config["variant-calling"]["filters"],
annotation=config["variant-calling"]["annotations"],
),
# 3. Sequencing Details
"results/{date}/qc/laboratory/multiqc.html",
Expand All @@ -394,8 +395,9 @@ rule snakemake_reports_patient:
),
# 5. Variant Call Files
expand(
["results/{{date}}/ucsc-vcfs/all.{{date}}.{filter}.vcf"],
["results/{{date}}/ucsc-vcfs/all.{{date}}.{filter}.{annotation}.vcf"],
filter=config["variant-calling"]["filters"],
annotation=config["variant-calling"]["annotations"],
),
# 6. High Quality Genomes
"results/high-quality-genomes/{date}.fasta",
Expand Down Expand Up @@ -430,9 +432,10 @@ use rule snakemake_reports_patient as snakemake_reports_environment with:
["results/{{date}}/lineage-variant-report/{sample}.lineage-variants"]
),
lambda wildcards: expand(
"results/{{date}}/vcf-report/{target}.{filter}",
"results/{{date}}/vcf-report/{target}.{filter}.{annotation}",
target=get_samples_for_date(wildcards.date) + ["all"],
filter=config["variant-calling"]["filters"],
annotation=config["variant-calling"]["annotations"],
),
# 3. Sequencing Details
"results/{date}/qc/laboratory/multiqc.html",
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/lineage_variant_calling.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ use rule overview_table_html as generate_lineage_variant_report with:
htmlindex="index.html",
caption="../report/lineage-variant-report.rst",
category="2. Variant Call Details",
subcategory="1. VOC Similarity",
subcategory=" VOC Similarity",
),
log:
"logs/{date}/lineage-variant-report/{sample}.log",
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/long_read.smk
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@ use rule assembly_polishing_ont as medaka_consensus_reference with:
rule bcftools_consensus_ont:
input:
fasta="results/{date}/consensus/medaka/{sample}/{sample}.fasta",
bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf", # clonal vs. subclonal?
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.bcf.csi",
bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf", # clonal vs. subclonal?
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf.csi",
output:
temp("results/{date}/consensus/bcftools/{sample}.fasta"),
log:
Expand Down
82 changes: 82 additions & 0 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,88 @@ rule get_genome_annotation:
"zcat | grep -v '#' | sort -k1,1 -k4,4n -k5,5n -t$'\t' | bgzip -c > {output}) 2> {log}"


rule download_protein_products:
output:
temp("resources/protein_products.bed"),
log:
"logs/download_protein_products.log",
conda:
"../envs/ucsc.yaml"
shell:
"(bigBedToBed http://hgdownload.soe.ucsc.edu/gbdb/wuhCor1/uniprot/unipChainCov2.bb"
" -chrom=NC_045512v2 -start=0 -end=29903 {output})"
"2>{log}"


rule bed2gff:
input:
"resources/protein_products.bed",
output:
temp("resources/protein_products.gff"),
log:
"logs/bed2gff3.log",
conda:
"../envs/genometools.yaml"
shell:
"(cut -f1-12 {input} | sed -e 's/ /-/g' | sed -e 's/NC_045512v2/NC_045512.2/g'"
" | gt bed_to_gff3 -featuretype gene -thicktype transcript -blocktype CDS -o {output} -force /dev/stdin )"
"2> {log}"


rule filter_gff:
input:
"resources/protein_products.gff",
output:
temp("resources/protein_products.formatted.gff"),
log:
"logs/format_gff.log",
conda:
"../envs/tabix.yaml"
shell:
# download, sort and bgzip gff (see https://www.ensembl.org/info/docs/tools/vep/script/vep_custom.html)
"cat {input} | grep -v '#' > {output} 2> {log}"


rule fix_gff:
input:
"resources/protein_products.formatted.gff",
output:
temp("resources/protein_products.fixed.gff"),
log:
"logs/fix_gff.log",
conda:
"../envs/python.yaml"
script:
"../scripts/fix-protein-gff.py"


rule format_fixed_gff:
input:
"resources/protein_products.fixed.gff",
output:
temp("resources/protein_products.fixed.formatted.gff"),
log:
"logs/format_gff.log",
conda:
"../envs/tabix.yaml"
shell:
# download, sort and bgzip gff (see https://www.ensembl.org/info/docs/tools/vep/script/vep_custom.html)
"cat {input} | grep -v '#' | sort -k1,1 -k4,4n -k5,5n -k3,3n -t$'\t' > {output} 2> {log}"


rule compress_gff:
input:
"resources/protein_products.fixed.formatted.gff",
output:
"resources/protein_products.gff.gz",
log:
"logs/compress_gff.log",
conda:
"../envs/tabix.yaml"
shell:
"bgzip -c {input} > {output} 2> {log}"


rule get_genome_annotation_for_known_variants:
output:
"resources/annotation_known_variants.gff.gz",
Expand Down
10 changes: 5 additions & 5 deletions workflow/rules/variant_annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,19 @@ rule annotate_variants:
plugins="resources/vep/plugins",
fasta="resources/genomes/main.fasta",
fai="resources/genomes/main.fasta.fai",
gff="resources/annotation.gff.gz",
csi="resources/annotation.gff.gz.tbi",
gff=get_genome_annotation(),
csi=get_genome_annotation(suffix=".tbi"),
problematic="resources/problematic-sites.vcf.gz",
problematic_tbi="resources/problematic-sites.vcf.gz.tbi",
output:
calls="results/{date}/annotated-calls/ref~main/{sample}.bcf",
stats="results/{date}/annotated-calls/ref~main/{sample}.html",
calls="results/{date}/annotated-calls/ref~main/annot~{annotation}/{sample}.bcf",
stats="results/{date}/annotated-calls/ref~main/annot~{annotation}/{sample}.html",
params:
# Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
# Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
plugins=["LoFtool"],
extra=get_vep_args,
log:
"logs/{date}/vep/{sample}.log",
"logs/{date}/vep/{annotation}/{sample}.log",
wrapper:
"0.72.0/bio/vep/annotate"
16 changes: 9 additions & 7 deletions workflow/rules/variant_filtration.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@

rule vembrane_filter:
input:
"results/{date}/annotated-calls/ref~main/{sample}.bcf",
"results/{date}/annotated-calls/ref~main/annot~{annotation}/{sample}.bcf",
output:
temp("results/{date}/filtered-calls/ref~main/{sample}.{filter}.bcf"),
temp(
"results/{date}/filtered-calls/ref~main/annot~{annotation}/{sample}.{filter}.bcf"
),
params:
expression=get_vembrane_expression,
extra="",
log:
"logs/{date}/vembrane/{sample}.{filter}.log",
"logs/{date}/vembrane/{sample}.{filter}.{annotation}.log",
wrapper:
"0.71.1/bio/vembrane/filter"

Expand All @@ -23,13 +25,13 @@ rule control_fdr:
get_control_fdr_input,
output:
temp(
"results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.fdr-controlled.bcf"
"results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.{annotation}.fdr-controlled.bcf"
),
params:
fdr=config["variant-calling"]["fdr"],
events=get_target_events,
log:
"logs/{date}/control-fdr/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.log",
"logs/{date}/control-fdr/ref~{reference}/{sample}.{clonality}.{filter}.{vartype}.{annotation}.log",
conda:
"../envs/varlociraptor.yaml"
shell:
Expand All @@ -42,9 +44,9 @@ rule merge_calls:
calls=get_merge_calls_input(".bcf"),
idx=get_merge_calls_input(".bcf.csi"),
output:
"results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.bcf",
"results/{date}/filtered-calls/ref~{reference}/{sample}.{clonality}.{filter}.{annotation}.bcf",
log:
"logs/{date}/merge-calls/ref~{reference}/{sample}.{clonality}.{filter}.log",
"logs/{date}/merge-calls/ref~{reference}/{sample}.{clonality}.{filter}.{annotation}.log",
params:
"-a -Ob",
wrapper:
Expand Down
Loading