perf: ont downsampling normalisation (#489)
* wrote new script map_bin_amp_sampler.py for 'normalizing' (reducing) coverage on each amplicon to a common cutoff, and started modifying rule long_read.smk to do the required mapping and run the script (a rough sketch of the capping idea follows this list)

* added write-out and conversion of picked reads from the source fastq to the output fasta

* changed input and output file extensions for all rules downstream of cap_cov_amp from fastq to fasta; changed them in get_reads_by_stage as well

* fixed several snakemake related bugs

* multiple restorations to repair broken workflow

* established custom script for primer trimming (working title: map-trim)

* improved coverage-capping amp_covcap_sampler.py and amplicon primer-trimming map_trim.py scripts, including reverse strand swap for trimming

* added improved logic for clipping reads aligning to the '-' strand

* use only one map-trim run to remove all; reinstated porechop; multiple adaptations and fixes in assembly.smk, common.smk, long_read.smk, qc.smk, amp_covcap_sampler.py, map_trim.py; repaired workflow and changed naming patterns and formatting for ont; medaka rule -v; get_reads_by_stage, get_trimmed_reads, species_diversity_before_se, seqtk.yaml

* remove map-trim in order to reorganize

* perform all clipping and downsampling with notramp

* multiple renamings and adaptations

* cleanup

* adjusted output canu_correct

* linting

* get_artic_primer

* bumped notramp version; rerouted stdout+stderr to log in canu_correct

* restored get_artic_primers to get the bed file

* get_output_dir comments

* restore sample-table

* envs version nums

* envs conda-forge

* resolving comments

* updated primer paths

* rmv amp_covcap_sampler.py

* canu_correct: moved corr_gz from output to parameters

* nanofilt: accept compressed or decompressed input

* change gzip to compressed

* with corr_gz in output

* take out bcf rules

* orf

* echo test

* remove parenthesis

* added gzip to nanofilt env and replaced gunzip with gzip in rule nanofilt

* add file to nanofilt env

* set corMemory for testing to 6GB

* corona-spades in long_read.smk

* solved all issues with wrong input files for ont de novo assembly

* cleanup and temp-notes

* restored sample-table, removed echos, removed hard-coded path

* resolve log issue

* notramp env

* undo changes in test config

* undo changes in test primer

* add missing version in nanofilt.yaml

* add removed temp flags back in

* rmv whitespace

* update env file

* add canu logging

* update config

* add temp

* rmv hardcoded date path

* add logs
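
The core of this change is per-amplicon coverage normalisation: reads are binned by the amplicon they map to, and each bin is downsampled to a common cutoff (first via the custom sampler scripts, finally via notramp). A rough standalone sketch of the idea (all names hypothetical; the overlap-based binning is an assumption for illustration, not notramp's actual implementation):

import random
from collections import defaultdict

def cap_amplicon_coverage(reads, amplicons, cutoff=200, seed=42):
    """Downsample mapped reads so no amplicon exceeds `cutoff` reads.

    reads: iterable of (read_id, ref_start, ref_end) alignment coordinates.
    amplicons: dict mapping amplicon name -> (start, end) from the primer BED.
    """
    bins = defaultdict(list)
    for read_id, start, end in reads:
        # assign each read to the amplicon its alignment overlaps most
        best = max(
            amplicons,
            key=lambda amp: min(end, amplicons[amp][1]) - max(start, amplicons[amp][0]),
        )
        bins[best].append(read_id)
    rng = random.Random(seed)  # fixed seed keeps the sampling reproducible
    picked = []
    for ids in bins.values():
        picked.extend(ids if len(ids) <= cutoff else rng.sample(ids, cutoff))
    return picked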

Co-authored-by: Thomas Battenfeld <[email protected]>
Co-authored-by: Alexander Thomas <[email protected]>
4 people authored Mar 15, 2022
1 parent f44942d commit f502dd2
Showing 9 changed files with 86 additions and 1,343 deletions.
1,238 changes: 0 additions & 1,238 deletions resources/ARTIC_v3_adapters.py

This file was deleted.

3 changes: 2 additions & 1 deletion workflow/envs/canu.yaml
@@ -2,5 +2,6 @@ channels:
- bioconda
- conda-forge
dependencies:
- canu =2.2
# do not use canu 2.2 !
- canu =2.1.1
- minimap2 =2.22
2 changes: 2 additions & 0 deletions workflow/envs/nanofilt.yaml
@@ -3,3 +3,5 @@ channels:
- conda-forge
dependencies:
- nanofilt =2.8
- gzip =1.11
- file =5.39
5 changes: 3 additions & 2 deletions workflow/envs/primechop.yaml → workflow/envs/notramp.yaml
@@ -1,6 +1,7 @@
channels:
- simakro
- bioconda
- conda-forge
dependencies:
- porechop =0.2
- python =3.9
- notramp =1.0.5
- minimap2 =2.22
2 changes: 1 addition & 1 deletion workflow/envs/porechop.yaml → workflow/envs/seqtk.yaml
@@ -2,4 +2,4 @@ channels:
- bioconda
- conda-forge
dependencies:
- porechop =0.2
- seqtk =1.3
8 changes: 3 additions & 5 deletions workflow/rules/assembly.smk
@@ -179,11 +179,11 @@ rule assembly_polishing_illumina:
# polish ont de novo assembly
rule assembly_polishing_ont:
input:
fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz",
fasta="results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta",
reference="results/{date}/contigs/ordered/{sample}.fasta",
output:
report(
"results/{date}/polishing/medaka/{sample}/{sample}.fasta",
"results/{date}/polishing/medaka/{sample}/consensus.fasta",
category="4. Sequences",
subcategory="1. De Novo Assembled Sequences",
caption="../report/assembly_ont.rst",
@@ -197,9 +197,7 @@ rule assembly_polishing_ont:
"../envs/medaka.yaml"
threads: 4
shell:
"(medaka_consensus -v -f -i {input.fasta} -o {params.outdir} -d {input.reference} -t {threads} &&"
" mv {params.outdir}/consensus.fasta {output}) "
" > {log} 2>&1"
"medaka_consensus -f -i {input.fasta} -o {params.outdir} -d {input.reference} -t {threads} > {log} 2>&1"


rule aggregate_polished_de_novo_sequences:
23 changes: 7 additions & 16 deletions workflow/rules/common.smk
@@ -407,7 +407,7 @@ def get_reads(wildcards):
)

ont_pattern = expand(
"results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz",
"results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta",
**wildcards,
)

@@ -458,7 +458,7 @@ def get_reads_after_qc(wildcards, read="both"):
**wildcards,
)
ont_pattern = expand(
"results/{date}/nonhuman-reads/se/{sample}.fastq.gz", **wildcards
"results/{date}/nonhuman-reads/se/{sample}.fastq", **wildcards
)
ion_torrent_pattern = expand(
"results/{date}/read-clipping/fastq/se/{sample}.fastq", **wildcards
@@ -1297,15 +1297,6 @@ def get_include_flag_for_date(wildcards):
return [get_include_flag(sample) for sample in allsamplelist]


def get_artic_primer(wildcards):
# TODO add more _adapters.py (not preferred) or
# add a script to generate them from a link to a bed file.
# The bed file can be found in the artic repo. Related to #356
return "resources/ARTIC_v{}_adapters.py".format(
config["preprocessing"]["artic-primer-version"]
)


def get_trimmed_reads(wildcards):
"""Returns paths of files of the trimmed reads for parsing by kraken."""
return get_list_of_expanded_patters_by_technology(
@@ -1314,7 +1305,7 @@
"results/{{{{date}}}}/trimmed/fastp-pe/{{sample}}.{read}.fastq.gz",
read=[1, 2],
),
ont_pattern="results/{{date}}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq.gz",
ont_pattern="results/{{date}}/corrected/{sample}/{sample}.correctedReads.fasta",
ion_torrent_pattern="results/{{date}}/trimmed/fastp-se/{sample}.fastq.gz",
)

@@ -1407,19 +1398,19 @@ def get_reads_by_stage(wildcards):
if wildcards.stage == "raw":
return get_fastqs(wildcards)
elif wildcards.stage == "trimmed":
return "results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq"
return "results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta"
elif wildcards.stage == "clipped":
return "results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq"
return "results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta"
elif wildcards.stage == "filtered":
return "results/{date}/trimmed/nanofilt/{sample}.fastq"
return "results/{date}/trimmed/nanofilt/{sample}.fasta"


def get_polished_sequence(wildcards):
"""Returns path to polished sequences, depend on sequencing technology."""
return get_pattern_by_technology(
wildcards,
illumina_pattern="results/{date}/polishing/bcftools-illumina/{sample}.fasta",
ont_pattern="results/{date}/polishing/medaka/{sample}/{sample}.fasta",
ont_pattern="results/{date}/consensus/medaka/{sample}/consensus.fasta",
ion_torrent_pattern="results/{date}/polishing/bcftools-illumina/{sample}.fasta",
)

140 changes: 65 additions & 75 deletions workflow/rules/long_read.smk
@@ -1,4 +1,4 @@
# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster.
# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster, Simon Magin.
# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause)
# This file may not be copied, modified, or distributed
# except according to those terms.
@@ -32,123 +32,113 @@ rule count_fastq_reads:
"echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}"


# Intermediate number of threads (4-8) achieve best speedup of a+btrimming.
# For large files 8 threads help accelerate some, small files are processed faster with 4 threads.
rule porechop_adapter_barcode_trimming:
rule nanofilt:
input:
get_fastqs,
output:
temp("results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq"),
conda:
"../envs/porechop.yaml"
temp("results/{date}/filtered/nanofilt/{sample}.fastq"),
log:
"logs/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.log",
threads: 8
shell:
"porechop -i {input} -o {output} -t {threads} -v 1 > {log} 2>&1"


rule customize_primer_porechop:
input:
get_artic_primer,
output:
temp("results/.indicators/replacement_notice.txt"),
"logs/{date}/nanofilt/{sample}.log",
params:
min_length=config["quality-criteria"]["ont"]["min-length-reads"],
min_PHRED=config["quality-criteria"]["ont"]["min-PHRED"],
conda:
"../envs/primechop.yaml"
log:
"logs/customize_primer_porechop.log",
"../envs/nanofilt.yaml"
shell:
"(cp {input} $CONDA_PREFIX/lib/python3.9/site-packages/porechop/adapters.py && "
'echo "replaced adpaters in adapter.py file in $CONDA_PREFIX/lib/python3.9/site-packages/porechop/adapters.py with ARTICv3 primers" > {output}) '
"2> {log}"
"""
if file {input} | grep -q compressed ; then
gzip -d -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log}
else
NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log}
fi
"""


# Using a low number of threads (2-4) speed up primer-trimming significantly (>2x), even for large files,
# presumably due to the much higher number of target-sequences for trimming as compared
# to barcode+adapter-trimming. However, using only one thread is again very slow.
rule porechop_primer_trimming:
rule downsample_and_trim_raw:
input:
fastq_in=(
"results/{date}/trimmed/porechop/adapter_barcode_trimming/{sample}.fastq"
),
repl_flag="results/.indicators/replacement_notice.txt",
primer=config["preprocessing"]["amplicon-primers"],
reads="results/{date}/filtered/nanofilt/{sample}.fastq",
ref_genome="resources/genomes/main.fasta",
output:
temp("results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq"),
conda:
"../envs/primechop.yaml"
"results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.fasta",
"results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta",
params:
outdir=get_output_dir,
log:
"logs/{date}/trimmed/porechop/primer_clipped/{sample}.log",
threads: 2
"results/{date}/norm_trim_raw_reads/{sample}/notramp.log",
conda:
"../envs/notramp.yaml"
shell:
"(porechop -i {input.fastq_in} -o {output} --no_split --end_size 35 --extra_end_trim 0 -t {threads} -v 1) > {log} 2>&1"
"notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir} 2> {log}"


rule nanofilt:
use rule assembly_polishing_ont as medaka_consensus_reference with:
input:
"results/{date}/trimmed/porechop/primer_clipped/{sample}.fastq",
# Don't ever use corrected reads as input for medaka; it is supposed to polish with raw reads!
fasta="results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.clip.fasta",
reference="resources/genomes/main.fasta",
output:
temp("results/{date}/trimmed/nanofilt/{sample}.fastq"),
log:
"logs/{date}/nanofilt/{sample}.log",
params:
min_length=config["quality-criteria"]["ont"]["min-length-reads"],
min_PHRED=config["quality-criteria"]["ont"]["min-PHRED"],
conda:
"../envs/nanofilt.yaml"
shell:
"NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 500 {input} > {output} 2> {log}"
"results/{date}/consensus/medaka/{sample}/consensus.fasta",


rule canu_correct:
input:
"results/{date}/trimmed/nanofilt/{sample}.fastq",
"results/{date}/norm_trim_raw_reads/{sample}/{sample}.cap.fasta",
output:
"results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz",
temp(directory("results/{date}/corrected/{sample}/correction")),
temp(directory("results/{date}/corrected/{sample}/{sample}.seqStore")),
corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz",
corr_fa="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta",
log:
"logs/{date}/canu/assemble/{sample}.log",
"logs/{date}/canu/correct/{sample}.log",
params:
outdir=get_output_dir,
corr_gz="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz",
concurrency=lambda w, threads: get_canu_concurrency(threads),
min_length=config["quality-criteria"]["ont"]["min-length-reads"],
for_testing=lambda w, threads: get_if_testing(
f"corThreads={threads} redMemory=6 oeaMemory=6"
f"corThreads={threads} corMemory=6 redMemory=6 oeaMemory=6"
),
conda:
"../envs/canu.yaml"
threads: 16
shell:
"( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi &&"
" canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k minOverlapLength=10 minReadLength=200"
" useGrid=false {params.for_testing}"
" corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000"
" corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap"
" corConcurrency={params.concurrency}"
" cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency}"
" cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency}"
" obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency}"
" utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency}"
" redConcurrency={params.concurrency} redThreads={params.concurrency}"
" ovbConcurrency={params.concurrency}"
" ovsConcurrency={params.concurrency}"
" oeaConcurrency={params.concurrency})"
"( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && "
"canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k "
"minOverlapLength=10 minReadLength=200 useGrid=false {params.for_testing} "
"corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 "
"corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap "
"corConcurrency={params.concurrency} ovbConcurrency={params.concurrency} "
"cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} "
"cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} "
"obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} "
"utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} "
"redConcurrency={params.concurrency} redThreads={params.concurrency} "
"ovsConcurrency={params.concurrency} oeaConcurrency={params.concurrency} && "
"gzip -d {params.corr_gz} --keep)"
"> {log} 2>&1"


# rule medaka_consensus_reference:
use rule assembly_polishing_ont as medaka_consensus_reference with:
rule clip_adbc_corrected:
input:
fasta="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta.gz",
reference="resources/genomes/main.fasta",
primer=config["preprocessing"]["amplicon-primers"],
reads="results/{date}/corrected/{sample}/{sample}.correctedReads.fasta",
ref_genome="resources/genomes/main.fasta",
output:
temp("results/{date}/consensus/medaka/{sample}/{sample}.fasta"),
"results/{date}/corrected/{sample}/{sample}.correctedReads.clip.fasta",
params:
outdir=get_output_dir,
log:
"results/{date}/corrected/{sample}/notramp.log",
conda:
"../envs/notramp.yaml"
shell:
"notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir} 2> {log}"


# polish consensus
rule bcftools_consensus_ont:
input:
fasta="results/{date}/consensus/medaka/{sample}/{sample}.fasta",
fasta="results/{date}/consensus/medaka/{sample}/consensus.fasta",
bcf="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf", # clonal vs. subclonal?
bcfidx="results/{date}/filtered-calls/ref~{sample}/{sample}.subclonal.high+moderate-impact.orf.bcf.csi",
output:
8 changes: 3 additions & 5 deletions workflow/rules/qc.smk
@@ -153,9 +153,7 @@ rule species_diversity_before_se:
kraken_output=temp(
"results/{date}/species-diversity/se/{sample}/{sample}.kraken"
),
report=(
"results/{date}/species-diversity/se/{sample}/{sample}.uncleaned.kreport2"
),
report="results/{date}/species-diversity/se/{sample}/{sample}.uncleaned.kreport2",
log:
"logs/{date}/kraken/se/{sample}.log",
threads: 8
@@ -236,7 +234,7 @@ rule order_nonhuman_reads_se:
input:
"results/{date}/mapped/ref~main+human/nonhuman/{sample}.bam",
output:
fq=temp("results/{date}/nonhuman-reads/se/{sample}.fastq.gz"),
fq=temp("results/{date}/nonhuman-reads/se/{sample}.fastq"),
bam_sorted=temp("results/{date}/nonhuman-reads/{sample}.sorted.bam"),
log:
"logs/{date}/order_nonhuman_reads/se/{sample}.log",
Expand All @@ -246,7 +244,7 @@ rule order_nonhuman_reads_se:
shell:
"(samtools sort -@ {threads} -n {input} -o {output.bam_sorted}; "
" samtools fastq -@ {threads} -0 {output.fq} {output.bam_sorted})"
" > {log} 2>&1"
"> {log} 2>&1"


# analysis of species diversity present AFTER removing human contamination
