From 00df793bbe841dbedc51d89705995383245033ec Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 15:20:22 +0200 Subject: [PATCH 01/11] Attempted adding post-map filter idxstats --- main.nf | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9e846c2ea..7ded1a23f 100644 --- a/main.nf +++ b/main.nf @@ -887,7 +887,7 @@ process samtools_filter { file bam from ch_mapped_reads_filter.mix(ch_mapped_reads_filter_cm,ch_bwamem_mapped_reads_filter) output: - file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk + file "*filtered.bam" into ch_bam_filtered_idxstats, ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk file "*.fastq.gz" optional true file "*.unmapped.bam" optional true file "*.{bai,csi}" @@ -959,6 +959,28 @@ process strip_input_fastq { } +/* +* Step 5b: Keep unmapped/remove unmapped reads idxstats +*/ + + +process samtools_idxstats_after_filter { + tag "$prefix" + publishDir "${params.outdir}/samtools/stats", mode: 'copy' + + input: + file(bam) from ch_bam_filtered_idxstats) + + output: + file "*.stats" into ch_filtered_idxstats_for_multiqc + + script: + prefix = "$bam" - ~/(\.bam)?$/ + """ + samtools flagstat $bam > ${prefix}.stats + """ +} + /* Step 6: DeDup / MarkDuplicates @@ -1283,6 +1305,7 @@ process multiqc { file ('software_versions/software_versions_mqc*') from software_versions_yaml.collect().ifEmpty([]) file ('adapter_removal/*') from ch_adapterremoval_logs.collect().ifEmpty([]) file ('idxstats/*') from ch_idxstats_for_multiqc.collect().ifEmpty([]) + file ('idxstats_filtered') from ch_filtered_idxstats_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from ch_preseq_results.collect().ifEmpty([]) file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([]) file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([]) From f537a74f718028b45eff9dad95443e62f8fcfbe2 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 15:30:18 +0200 Subject: [PATCH 02/11] Channel typo fix --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7ded1a23f..eb5615d2a 100644 --- a/main.nf +++ b/main.nf @@ -969,7 +969,7 @@ process samtools_idxstats_after_filter { publishDir "${params.outdir}/samtools/stats", mode: 'copy' input: - file(bam) from ch_bam_filtered_idxstats) + file(bam) from ch_mapped_reads_filtered_idxstats.mix(ch_mapped_reads_filtered_idxstats) output: file "*.stats" into ch_filtered_idxstats_for_multiqc From 82861ba7632a87e48c33e810f8e20cf29dc50c70 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 15:32:07 +0200 Subject: [PATCH 03/11] Typo input fix wrong channel name --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index eb5615d2a..ad42e6b82 100644 --- a/main.nf +++ b/main.nf @@ -969,7 +969,7 @@ process samtools_idxstats_after_filter { publishDir "${params.outdir}/samtools/stats", mode: 'copy' input: - file(bam) from ch_mapped_reads_filtered_idxstats.mix(ch_mapped_reads_filtered_idxstats) + file(bam) from ch_bam_filtered_idxstats.mix(ch_bam_filtered_idxstats) output: file "*.stats" into ch_filtered_idxstats_for_multiqc From d2613bfa0e14cb43496f8953d9712044a403a93d Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 15:42:33 +0200 Subject: [PATCH 04/11] Removed errorenous channel mixing --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ad42e6b82..9661e7f19 100644 --- a/main.nf +++ b/main.nf @@ -969,7 +969,7 @@ process samtools_idxstats_after_filter { publishDir "${params.outdir}/samtools/stats", mode: 'copy' input: - file(bam) from ch_bam_filtered_idxstats.mix(ch_bam_filtered_idxstats) + file(bam) from ch_bam_filtered_idxstats output: file "*.stats" into ch_filtered_idxstats_for_multiqc From 371bf58be6363f996e8e2a25340a67b8474ff2a9 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 15:56:12 +0200 Subject: [PATCH 05/11] Fixed idxstats to flagstat, added extra MultiQC module for post-samtools filter stats --- assets/multiqc_config.yaml | 13 +++++++++++-- main.nf | 28 ++++++++++++++-------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 0fe33abd9..d1c0a0e1f 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -14,7 +14,14 @@ top_modules: path_filters: - '*.truncated_fastqc.zip' - '*.combined*_fastqc.zip' - - 'samtools' + - 'samtools': + name: 'Samtools Flagstat (pre-samtools filter)' + path_filters: + - '*.sorted.stats' + - 'samtools': + name: 'Samtools Flagstat (post-samtools filter)' + path_filters: + - '*.sorted.bam.filtered.stats' - 'dedup' - 'preseq' - 'qualimap' @@ -67,8 +74,10 @@ table_columns_placement: total_sequences: 400 avg_sequence_length: 410 percent_gc: 420 - Samtools Flagstat: + Samtools Flagstat (pre-samtools filter): mapped_passed: 500 + Samtools Flagstat (post-samtools filter): + mapped_passed: 510 DeDup: clusterfactor: 600 duplication_rate: 610 diff --git a/main.nf b/main.nf index 9661e7f19..1829c9e74 100644 --- a/main.nf +++ b/main.nf @@ -713,7 +713,7 @@ process bwa { output: - file "*.sorted.bam" into ch_mapped_reads_idxstats,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler, ch_bwa_mapped_reads_strip + file "*.sorted.bam" into ch_mapped_reads_flagstat,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler, ch_bwa_mapped_reads_strip file "*.{bai,csi}" into ch_bam_index_for_damageprofiler @@ -780,7 +780,7 @@ process circularmapper{ file fasta from fasta_for_indexing output: - file "*.sorted.bam" into ch_mapped_reads_idxstats_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm, ch_circular_mapped_reads_strip + file "*.sorted.bam" into ch_mapped_reads_flagstat_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm, ch_circular_mapped_reads_strip file "*.{bai,csi}" script: @@ -824,7 +824,7 @@ process bwamem { file index from bwa_index_bwamem output: - file "*.sorted.bam" into ch_bwamem_mapped_reads_idxstats,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler, ch_bwamem_mapped_reads_strip + file "*.sorted.bam" into ch_bwamem_mapped_reads_flagstat,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler, ch_bwamem_mapped_reads_strip file "*.{bai,csi}" @@ -848,18 +848,18 @@ process bwamem { } /* -* Step 4 - IDXStats +* Step 4 - flagstat */ -process samtools_idxstats { +process samtools_flagstat { tag "$prefix" publishDir "${params.outdir}/samtools/stats", mode: 'copy' input: - file(bam) from ch_mapped_reads_idxstats.mix(ch_mapped_reads_idxstats_cm,ch_bwamem_mapped_reads_idxstats) + file(bam) from ch_mapped_reads_flagstat.mix(ch_mapped_reads_flagstat_cm,ch_bwamem_mapped_reads_flagstat) output: - file "*.stats" into ch_idxstats_for_multiqc + file "*.stats" into ch_flagstat_for_multiqc script: prefix = "$bam" - ~/(\.bam)?$/ @@ -887,7 +887,7 @@ process samtools_filter { file bam from ch_mapped_reads_filter.mix(ch_mapped_reads_filter_cm,ch_bwamem_mapped_reads_filter) output: - file "*filtered.bam" into ch_bam_filtered_idxstats, ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk + file "*filtered.bam" into ch_bam_filtered_flagstat, ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk file "*.fastq.gz" optional true file "*.unmapped.bam" optional true file "*.{bai,csi}" @@ -960,19 +960,19 @@ process strip_input_fastq { } /* -* Step 5b: Keep unmapped/remove unmapped reads idxstats +* Step 5b: Keep unmapped/remove unmapped reads flagstat */ -process samtools_idxstats_after_filter { +process samtools_flagstat_after_filter { tag "$prefix" publishDir "${params.outdir}/samtools/stats", mode: 'copy' input: - file(bam) from ch_bam_filtered_idxstats + file(bam) from ch_bam_filtered_flagstat output: - file "*.stats" into ch_filtered_idxstats_for_multiqc + file "*.stats" into ch_filtered_flagstat_for_multiqc script: prefix = "$bam" - ~/(\.bam)?$/ @@ -1304,8 +1304,8 @@ process multiqc { file('fastqc/*') from ch_fastqc_after_clipping.collect().ifEmpty([]) file ('software_versions/software_versions_mqc*') from software_versions_yaml.collect().ifEmpty([]) file ('adapter_removal/*') from ch_adapterremoval_logs.collect().ifEmpty([]) - file ('idxstats/*') from ch_idxstats_for_multiqc.collect().ifEmpty([]) - file ('idxstats_filtered') from ch_filtered_idxstats_for_multiqc.collect().ifEmpty([]) + file ('flagstat/*') from ch_flagstat_for_multiqc.collect().ifEmpty([]) + file ('flagstat_filtered') from ch_filtered_flagstat_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from ch_preseq_results.collect().ifEmpty([]) file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([]) file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([]) From d0b4c9a68ba27b18aa0fb573bcf1a33c29e4ac50 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 16:26:56 +0200 Subject: [PATCH 06/11] Fixed tab typos in multiqc config --- assets/multiqc_config.yaml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index d1c0a0e1f..34fb017d3 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -10,18 +10,18 @@ top_modules: - 'fastp' - 'adapterRemoval' - 'fastqc': - name: 'FastQC (post-AdapterRemoval)' - path_filters: - - '*.truncated_fastqc.zip' - - '*.combined*_fastqc.zip' + name: 'FastQC (post-AdapterRemoval)' + path_filters: + - '*.truncated_fastqc.zip' + - '*.combined*_fastqc.zip' - 'samtools': - name: 'Samtools Flagstat (pre-samtools filter)' - path_filters: - - '*.sorted.stats' + name: 'Samtools Flagstat (pre-samtools filter)' + path_filters: + - '*.sorted.stats' - 'samtools': - name: 'Samtools Flagstat (post-samtools filter)' - path_filters: - - '*.sorted.bam.filtered.stats' + name: 'Samtools Flagstat (post-samtools filter)' + path_filters: + - '*.sorted.bam.filtered.stats' - 'dedup' - 'preseq' - 'qualimap' @@ -54,7 +54,7 @@ table_columns_visible: 1_x_pc: True 5_x_pc: True percentage_aligned: False - DamageProfiler:: + DamageProfiler: 3 Prime1: True 3 Prime2: True 5 Prime1: True @@ -78,7 +78,7 @@ table_columns_placement: mapped_passed: 500 Samtools Flagstat (post-samtools filter): mapped_passed: 510 - DeDup: + DeDup: clusterfactor: 600 duplication_rate: 610 QualiMap: @@ -93,7 +93,7 @@ table_columns_placement: mtreads: 800 mt_cov_avg: 810 mt_nuc_ratio: 820 - DamageProfiler:: + DamageProfiler: 3 Prime1: 900 3 Prime2: 910 5 Prime1: 920 From 7fe7375b5e04f094d01f1341fdc9ae361a5d1d76 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 16:38:53 +0200 Subject: [PATCH 07/11] Fixed tab typos in multiqc config --- assets/multiqc_config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 34fb017d3..d200d78ef 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -15,13 +15,13 @@ top_modules: - '*.truncated_fastqc.zip' - '*.combined*_fastqc.zip' - 'samtools': - name: 'Samtools Flagstat (pre-samtools filter)' - path_filters: - - '*.sorted.stats' + name: 'Samtools Flagstat (pre-samtools filter)' + path_filters: + - '*.sorted.stats' - 'samtools': - name: 'Samtools Flagstat (post-samtools filter)' - path_filters: - - '*.sorted.bam.filtered.stats' + name: 'Samtools Flagstat (post-samtools filter)' + path_filters: + - '*.sorted.bam.filtered.stats' - 'dedup' - 'preseq' - 'qualimap' From 5bb7884fe672928d74e4bd4cee421cbcff991f3d Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 16:53:47 +0200 Subject: [PATCH 08/11] Fix incorrect multiqc file() channel --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 1829c9e74..dd3c3b85f 100644 --- a/main.nf +++ b/main.nf @@ -972,7 +972,7 @@ process samtools_flagstat_after_filter { file(bam) from ch_bam_filtered_flagstat output: - file "*.stats" into ch_filtered_flagstat_for_multiqc + file "*.stats" into ch_bam_filtered_flagstat_for_multiqc script: prefix = "$bam" - ~/(\.bam)?$/ @@ -1305,7 +1305,7 @@ process multiqc { file ('software_versions/software_versions_mqc*') from software_versions_yaml.collect().ifEmpty([]) file ('adapter_removal/*') from ch_adapterremoval_logs.collect().ifEmpty([]) file ('flagstat/*') from ch_flagstat_for_multiqc.collect().ifEmpty([]) - file ('flagstat_filtered') from ch_filtered_flagstat_for_multiqc.collect().ifEmpty([]) + file ('flagstat_filtered/*') from ch_bam_filtered_flagstat_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from ch_preseq_results.collect().ifEmpty([]) file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([]) file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([]) From f5eab0a417c34362e1d748f2f525eba3c062df28 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 19:43:55 +0200 Subject: [PATCH 09/11] Fixed missing samtools filter step names in general stats --- assets/multiqc_config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index d200d78ef..c8b77132d 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -50,6 +50,10 @@ table_columns_visible: percent_duplicates: False total_sequences: True percent_gc: True + Samtools Flagstat (pre-samtools filter): + mapped_passed: True + Samtools Flagstat (post-samtools filter): + mapped_passed: True QualiMap: 1_x_pc: True 5_x_pc: True From e778c7bae612b3aae5caa1be4257ad0a364b5228 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 20:22:20 +0200 Subject: [PATCH 10/11] Updated docs and changelog --- CHANGELOG.md | 1 + docs/output.md | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 501fca5a2..5a5146aba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * Merged in [nf-core/tools](https://github.com/nf-core/tools) release V1.6 template changes * A lot more automated tests using Travis CI * Don't ignore DamageProfiler errors anymore +* Added post-mapping filtering statistics module and corresponding MultiQC statistics [#217](https://github.com/nf-core/eager/issues/217) ### `Fixed` * [#152](https://github.com/nf-core/eager/pull/152) - DamageProfiler errors [won't crash entire pipeline anymore](https://github.com/nf-core/eager/issues/171) diff --git a/docs/output.md b/docs/output.md index a4e679362..379a36de8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -72,6 +72,7 @@ The default columns are as follows: * **Seqs** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. * **%GC** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. * **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering and deduplication. + * **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering and deduplication (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) * **Duplication Rate** This is from DeDup. This is the percentage of overall number of mapped reads that were an exact duplicate of another read. The number of reads removed by DeDup can be calculating this number by mapped reads (if no map quality filtering was applied!) * **Coverage** This is from Qualimap. This is the median number of times a base on your reference genome was covered by a read (i.e. depth coverage).. This average includes bases with 0 reads covering that position. * **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. @@ -233,6 +234,9 @@ The third row 'Mapped' represents the number of reads that found a place that co The remaining rows will be 0 when running `bwa aln` as these characteristucs of the data are not considered by the algorithm by default. +> **NB:** The Samtools (pre-samtools filter) plots displayed in the MultiQC report shows mapped reads without mapping quality filtering. This will contain reads that can map to multiple places on your reference genome with equal or slightly less mapping quality score. To see how your reads look after mapping quality, look at the FastQC reports in the Samtools (pre-samtools filter). You should expect after mapping quality filtering, that you will have less reads. + + ### DeDup #### Background DeDup is a duplicate removal tool which searchs for PCR duplicates and removes them from your BAM file. We remove these duplicates because otherwise you would be artificially increasing your coverage and subsequently confidence in genotyping, by considering these lab artefacts which are not biologically meaningful. DeDup looks for reads with the same start and end coordinates, and whether they have exactly the same sequence. The main difference of DeDup versus e.g. `samtools markduplicates` is that DeDup considers _both_ ends of a read, not just the start position, so it is more precise in removing actual duplicates without penalising often already low aDNA data. From 08c7fbf16df09da6195d8193ead123e1122a5753 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Sat, 8 Jun 2019 20:28:38 +0200 Subject: [PATCH 11/11] Pedantic clean up of section comment numbering --- main.nf | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index dd3c3b85f..8181564fa 100644 --- a/main.nf +++ b/main.nf @@ -443,7 +443,7 @@ ${summary.collect { k,v -> "
$k
${v ?: '