Merge pull request #13 from Aratz/multiqc_multireport

Generate reports per run, per project and per lane
nf-core · May 30, 2024 · 9cb1d68 · 9cb1d68
2 parents e93baf9 + 02affeb
commit 9cb1d68
Show file tree

Hide file tree

Showing 22 changed files with 532 additions and 71 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ results/
 testing/
 testing*
 *.pyc
+.nf-test
+.nf-test.log
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,8 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c
 
 ### `Added`
 
+- [#13](https://github.com/nf-core/seqinspector/pull/13) Generate reports per run, per project and per lane.
+
 ### `Fixed`
 
 ### `Dependencies`

diff --git a/README.md b/README.md
@@ -39,26 +39,19 @@
 > [!NOTE]
 > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
 
-<!-- TODO nf-core: Describe the minimum required steps to execute the pipeline, e.g. how to prepare samplesheets.
-     Explain what rows and columns represent. For instance (please edit as appropriate):
-
 First, prepare a samplesheet with your input data that looks as follows:
 
 `samplesheet.csv`:
 
 ```csv
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
+sample,lane,group,fastq_1,fastq_2,rundir
+CONTROL_REP1,1,GROUP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,200624_A00834_0183_BHMTFYDRXX
 ```
 
 Each row represents a fastq file (single-end) or a pair of fastq files (paired end).
 
--->
-
 Now, you can run the pipeline using:
 
-<!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->
-
 ```bash
 nextflow run nf-core/seqinspector \
    -profile <docker/singularity/.../institute> \
@@ -80,11 +73,11 @@ For more details about the output files and reports, please refer to the
 
 ## Credits
 
-nf-core/seqinspector was originally written by Adrien Coulier.
+nf-core/seqinspector was originally written by the Swedish [@NationalGenomicsInfrastructure](https://github.com/NationalGenomicsInfrastructure/).
 
 We thank the following people for their extensive assistance in the development of this pipeline:
 
-<!-- TODO nf-core: If applicable, make list of people who have also contributed -->
+- [@mahesh-panchal](https://github.com/mahesh-panchal)
 
 ## Contributions and Support
 

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,lane,project,fastq_1,fastq_2,rundir
+sample,lane,group,fastq_1,fastq_2,rundir
 SAMPLE_PAIRED_END,1,P001,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,/path/to/rundir
 SAMPLE_SINGLE_END,2,P002,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,,/path/to/rundir
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -19,11 +19,11 @@
                 "errorMessage": "Lane ID must be a number",
                 "meta": ["lane"]
             },
-            "project": {
+            "group": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Project ID cannot contain spaces",
-                "meta": ["project"]
+                "errorMessage": "Group ID cannot contain spaces",
+                "meta": ["group"]
             },
             "fastq_1": {
                 "type": "string",
@@ -47,7 +47,7 @@
                 "meta": ["rundir"]
             }
         },
-        "required": ["sample", "lane", "fastq_1"],
+        "required": ["sample", "fastq_1"],
         "dependentRequired": {
             "fastq_2": ["fastq_1"]
         }

diff --git a/conf/modules.config b/conf/modules.config
@@ -22,7 +22,7 @@ process {
         ext.args = '--quiet'
     }
 
-    withName: 'MULTIQC' {
+    withName: 'MULTIQC_GLOBAL' {
         ext.args   = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
         publishDir = [
             path: { "${params.outdir}/multiqc" },
@@ -31,4 +31,78 @@ process {
         ]
     }
 
+    // Per-lane MultiQC reports: published under <outdir>/multiqc/lanes/L<lane>/.
+    withName: 'MULTIQC_PER_LANE' {
+        ext.args   = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
+        publishDir = [
+            path: { "${params.outdir}/multiqc/lanes" },
+            mode: params.publish_dir_mode,
+            // saveAs maps each output name to its published relative path.
+            // The value of the matching case branch is the closure's result.
+            saveAs: {
+                filename ->
+                    switch (filename) {
+                        // A null result tells Nextflow not to publish the file.
+                        case 'versions.yml':
+                            null
+                            break
+                        // Outputs tagged "[LANE:<n>]_multiqc_{report.html,plots,data}".
+                        case ~/\[LANE:\d+\]_multiqc_(report\.html|plots|data)/:
+                            // Capture the numeric lane from the tag.
+                            def lane = (filename =~ /\[LANE:(\d+)\]_multiqc_(report\.html|plots|data)/)[0][1]
+                            // Drop the "[LANE:<n>]_" tag, keeping whatever precedes and follows it.
+                            def new_filename = filename.replaceFirst(
+                                "(?<prefix>.*)\\[LANE:${lane}\\]_(?<suffix>multiqc_(report\\.html|plots|data).*)",
+                                '${prefix}${suffix}')
+                            // Publish into a per-lane sub-directory, e.g. "L1/multiqc_report.html".
+                            "L${lane}/${new_filename}"
+                            break
+                        // Anything else keeps its original name.
+                        default:
+                            filename
+                    }
+            }
+        ]
+    }
+
+    // Per-group MultiQC reports: published under <outdir>/multiqc/groups/<group>/.
+    withName: 'MULTIQC_PER_GROUP' {
+        ext.args   = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
+        publishDir = [
+            path: { "${params.outdir}/multiqc/groups" },
+            mode: params.publish_dir_mode,
+            // saveAs maps each output name to its published relative path.
+            // The value of the matching case branch is the closure's result.
+            saveAs: {
+                filename ->
+                    switch (filename) {
+                        // A null result tells Nextflow not to publish the file.
+                        case 'versions.yml':
+                            null
+                            break
+                        // Outputs tagged "[GROUP:<name>]_multiqc_{report.html,plots,data}".
+                        case ~/\[GROUP:.+\]_multiqc_(report\.html|plots|data)/:
+                            def group = (filename =~ /\[GROUP:(.+)\]_multiqc_(report\.html|plots|data)/)[0][1]
+                            // Group IDs are only constrained to be non-whitespace, so they may
+                            // contain regex metacharacters ('+', '(', '.', ...). Quote the name
+                            // before embedding it in the pattern so it is matched literally.
+                            def quoted_group = java.util.regex.Pattern.quote(group)
+                            // Drop the "[GROUP:<name>]_" tag, keeping whatever precedes and follows it.
+                            def new_filename = filename.replaceFirst(
+                                "(?<prefix>.*)\\[GROUP:${quoted_group}\\]_(?<suffix>multiqc_(report\\.html|plots|data).*)",
+                                '${prefix}${suffix}')
+                            // Publish into a per-group sub-directory, e.g. "P001/multiqc_report.html".
+                            "${group}/${new_filename}"
+                            break
+                        // Anything else keeps its original name.
+                        default:
+                            filename
+                    }
+            }
+        ]
+    }
+
+    // Per-rundir MultiQC reports: published under <outdir>/multiqc/rundirs/<rundir>/.
+    withName: 'MULTIQC_PER_RUNDIR' {
+        ext.args   = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
+        publishDir = [
+            // Plural directory name, in line with the sibling "lanes" and "groups" outputs.
+            path: { "${params.outdir}/multiqc/rundirs" },
+            mode: params.publish_dir_mode,
+            // saveAs maps each output name to its published relative path.
+            // The value of the matching case branch is the closure's result.
+            saveAs: {
+                filename ->
+                    switch (filename) {
+                        // A null result tells Nextflow not to publish the file.
+                        case 'versions.yml':
+                            null
+                            break
+                        // Outputs tagged "[RUNDIR:<name>]_multiqc_{report.html,plots,data}".
+                        case ~/\[RUNDIR:.+\]_multiqc_(report\.html|plots|data)/:
+                            def rundir = (filename =~ /\[RUNDIR:(.+)\]_multiqc_(report\.html|plots|data)/)[0][1]
+                            // The rundir name may contain regex metacharacters ('.', '+', ...).
+                            // Quote it before embedding it in the pattern so it is matched literally.
+                            def quoted_rundir = java.util.regex.Pattern.quote(rundir)
+                            // Drop the "[RUNDIR:<name>]_" tag, keeping whatever precedes and follows it.
+                            def new_filename = filename.replaceFirst(
+                                "(?<prefix>.*)\\[RUNDIR:${quoted_rundir}\\]_(?<suffix>multiqc_(report\\.html|plots|data).*)",
+                                '${prefix}${suffix}')
+                            // Publish into a per-rundir sub-directory.
+                            "${rundir}/${new_filename}"
+                            break
+                        // Anything else keeps its original name.
+                        default:
+                            filename
+                    }
+            }
+        ]
+    }
 }
diff --git a/docs/output.md b/docs/output.md
@@ -6,8 +6,6 @@ This document describes the output produced by the pipeline. Most of the plots a
 
 The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
 
-<!-- TODO nf-core: Write this documentation describing your workflow's output -->
-
 ## Pipeline overview
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
@@ -48,6 +46,29 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m
   - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
   - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
   - `multiqc_plots/`: directory containing static images from the report in various formats.
+  - `lanes/` [1]
+    - `L1/`
+      - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
+      - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
+      - `multiqc_plots/`: directory containing static images from the report in various formats.
+    - `L2/`
+      - ...
+  - `groups/` [1]
+    - `GROUPNAME1/`
+      - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
+      - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
+      - `multiqc_plots/`: directory containing static images from the report in various formats.
+    - `GROUPNAME2/`
+      - ...
+  - `rundirs/` [1]
+    - `RUNDIR1/`
+      - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
+      - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
+      - `multiqc_plots/`: directory containing static images from the report in various formats.
+    - `RUNDIR2/`
+      - ...
+
+[1] These files will only be generated if `lane`, `group` or `rundir` were specified for some samples.
 
 </details>
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -10,47 +10,45 @@
 
 ## Samplesheet input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location.
 
 ```bash
 --input '[path to samplesheet file]'
 ```
 
-### Multiple runs of the same sample
+### Full samplesheet
 
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+The following simple run dir structure...
 
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
+```
+run_dir
+├── sample1_lane1_group1_r1.fq.gz
+├── sample2_lane1_group1_r1.fq.gz
+├── sample3_lane2_group2_r1.fq.gz
+└── sample4_lane2_group3_r1.fq.gz
 ```
 
-### Full samplesheet
-
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
-
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+...would be represented in the following samplesheet (shown as .tsv for readability)
 
 ```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+sample  lane  group   fastq_1                                       fastq_2 rundir
+sample1 1     group1  path/to/run_dir/sample1_lane1_group1_r1.fq.gz         path/to/run_dir
+sample2 1     group1  path/to/run_dir/sample2_lane1_group1_r1.fq.gz         path/to/run_dir
+sample3 2     group2  path/to/run_dir/sample3_lane2_group2_r1.fq.gz         path/to/run_dir
+sample4 2     group3  path/to/run_dir/sample4_lane2_group3_r1.fq.gz         path/to/run_dir
+
 ```
 
 | Column    | Description                                                                                                                                                                            |
 | --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `lane`    | Lane where the sample was processed on an Illumina instrument (optional).                                                                                                              |
+| `group`   | Group the sample belongs to, useful when several groups are pooled together (optional).                                                                                                |
 | `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz" (optional).                                                  |
+| `rundir`  | Path to the runfolder containing extra information about the sequencing run (optional).                                                                                                |
 
-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+Another [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
 ## Running the pipeline
 

diff --git a/main.nf b/main.nf
@@ -58,7 +58,10 @@ workflow NFCORE_SEQINSPECTOR {
     )
 
     emit:
-    multiqc_report = SEQINSPECTOR.out.multiqc_report // channel: /path/to/multiqc_report.html
+    global_report = SEQINSPECTOR.out.global_report  // channel: /path/to/multiqc_report.html
+    lane_reports  = SEQINSPECTOR.out.lane_reports   // channel: /path/to/multiqc_report.html
+    group_reports = SEQINSPECTOR.out.group_reports  // channel: /path/to/multiqc_report.html
+    rundir_report = SEQINSPECTOR.out.rundir_reports // channel: /path/to/multiqc_report.html
 
 }
 /*
@@ -101,7 +104,7 @@ workflow {
         params.outdir,
         params.monochrome_logs,
         params.hook_url,
-        NFCORE_SEQINSPECTOR.out.multiqc_report
+        NFCORE_SEQINSPECTOR.out.global_report,
     )
 }
 

diff --git a/nf-test.config b/nf-test.config
@@ -0,0 +1,8 @@
+// nf-test settings for this repository.
+config {
+
+    // Directory that holds the test files
+    testsDir "tests"
+    // Working directory for nf-test (also listed in .gitignore)
+    workDir ".nf-test"
+    // Nextflow config file used for test runs
+    configFile "tests/nextflow.config"
+    // Nextflow profiles enabled for test runs
+    profile "test,docker"
+
+}
diff --git a/subworkflows/local/utils_nfcore_seqinspector_pipeline/main.nf b/subworkflows/local/utils_nfcore_seqinspector_pipeline/main.nf
@@ -84,7 +84,7 @@ workflow PIPELINE_INITIALISATION {
         .fromSamplesheet("input") // Validates samplesheet against $projectDir/assets/schema_input.json. Path to validation schema is defined by $projectDir/nextflow_schema.json
         .map {
             meta, fastq_1, fastq_2 ->
-                def id_string = "${meta.sample}_${meta.project ?: "ungrouped"}_${meta.lane}"
+                def id_string = "${meta.sample}_${meta.group ?: "ungrouped"}_${meta.lane}"
                 def updated_meta = meta + [ id: id_string ]
                 if (!fastq_2) {
                     return [ updated_meta.id, updated_meta + [ single_end:true ], [ fastq_1 ] ]
@@ -101,7 +101,6 @@ workflow PIPELINE_INITIALISATION {
         //     meta, fastqs ->
         //         return [ meta, fastqs.flatten() ]
         // }
-        .view()
         .set { ch_samplesheet }
 
     emit:

diff --git a/tests/MiSeq.main.nf.test b/tests/MiSeq.main.nf.test
@@ -0,0 +1,40 @@
+// End-to-end pipeline test: runs main.nf with the MiSeq test config and
+// snapshots selected MultiQC data files from the lane, group and global reports.
+nextflow_pipeline {
+
+    name "Test Workflow main.nf on MiSeq data"
+    script "../main.nf"
+    tag "seqinspector"
+    tag "PIPELINE"
+
+    test("MiSeq data test") {
+
+        when {
+            // Test-specific config; it sets the input samplesheet for this test.
+            config "./MiSeq.main.nf.test.config"
+            params {
+                outdir = "$outputDir"
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                // The order of the paths below determines the order of entries in the stored snapshot.
+                { assert snapshot(
+                    // Per-lane report (lane 1)
+                    path("$outputDir/multiqc/lanes/L1/multiqc_data/multiqc_citations.txt"),
+                    path("$outputDir/multiqc/lanes/L1/multiqc_data/multiqc_fastqc.txt"),
+                    path("$outputDir/multiqc/lanes/L1/multiqc_data/multiqc_general_stats.txt"),
+                    path("$outputDir/multiqc/lanes/L1/multiqc_data/multiqc_software_versions.txt"),
+
+                    // Per-group report (group P001)
+                    path("$outputDir/multiqc/groups/P001/multiqc_data/multiqc_citations.txt"),
+                    path("$outputDir/multiqc/groups/P001/multiqc_data/multiqc_fastqc.txt"),
+                    path("$outputDir/multiqc/groups/P001/multiqc_data/multiqc_general_stats.txt"),
+                    path("$outputDir/multiqc/groups/P001/multiqc_data/multiqc_software_versions.txt"),
+
+                    // Global report
+                    // NOTE(review): no per-rundir report is snapshotted here — confirm whether that is intentional.
+                    path("$outputDir/multiqc/multiqc_data/multiqc_citations.txt"),
+                    path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
+                    path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
+                    path("$outputDir/multiqc/multiqc_data/multiqc_software_versions.txt"),
+                    ).match()
+                }
+            )
+        }
+    }
+}
diff --git a/tests/MiSeq.main.nf.test.config b/tests/MiSeq.main.nf.test.config
@@ -0,0 +1,7 @@
+// Load the basic test config shared by the tests
+// (presumably resolves to tests/nextflow.config, next to this file — confirm).
+includeConfig 'nextflow.config'
+
+// Point the pipeline at the MiSeq samplesheet used by this test,
+// located under params.pipelines_testdata_base_path.
+params {
+    input  = params.pipelines_testdata_base_path + 'seqinspector/testdata/MiSeq/samplesheet.csv'
+}
diff --git a/tests/MiSeq.main.nf.test.snap b/tests/MiSeq.main.nf.test.snap
@@ -0,0 +1,19 @@
+{
+    "MiSeq data test": {
+        "content": [
+            "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f",
+            "multiqc_fastqc.txt:md5,692b8aed0614ed1655f2c1cbea1ba312",
+            "multiqc_general_stats.txt:md5,630167d67d3f92408cd1a04422c7196f",
+            "multiqc_software_versions.txt:md5,7452f1f7aae2a8a4066c2ef6cd5ceb95",
+            "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f",
+            "multiqc_fastqc.txt:md5,692b8aed0614ed1655f2c1cbea1ba312",
+            "multiqc_general_stats.txt:md5,630167d67d3f92408cd1a04422c7196f",
+            "multiqc_software_versions.txt:md5,7452f1f7aae2a8a4066c2ef6cd5ceb95",
+            "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f",
+            "multiqc_fastqc.txt:md5,692b8aed0614ed1655f2c1cbea1ba312",
+            "multiqc_general_stats.txt:md5,630167d67d3f92408cd1a04422c7196f",
+            "multiqc_software_versions.txt:md5,7452f1f7aae2a8a4066c2ef6cd5ceb95"
+        ],
+        "timestamp": "2024-05-30T13:14:20.263485"
+    }
+}