Merge branch 'dev' of https://github.com/nf-core/crisprseq into umi-c…

…lustering
nf-core · Jun 28, 2023 · 2227d89 · 2227d89
2 parents 7f9bfda + c96bec9
commit 2227d89
Show file tree

Hide file tree

Showing 13 changed files with 158 additions and 54 deletions.
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
@@ -1,4 +1,4 @@
-name: nf-core AWS full size tests
+name: nf-core AWS full size tests targeted
 # This workflow is triggered on published releases.
 # It can be additionally triggered manually with GitHub actions workflow dispatch button.
 # It runs the -profile 'test_full' on AWS batch
@@ -19,34 +19,13 @@ jobs:
           workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
           access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
           compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }}
+          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }}/targeted_test
           parameters: |
             {
-              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/crisprseq/results-${{ github.sha }}"
+              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/crisprseq/results-${{ github.sha }}/targeted_test"
             }
           profiles: test_full,aws_tower
       - uses: actions/upload-artifact@v3
         with:
           name: Tower debug log file
           path: tower_action_*.log
-  run-tower-screening:
-    name: Run AWS full tests screening
-    if: github.repository == 'nf-core/crisprseq'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Launch workflow via tower
-        uses: nf-core/tower-action@v3
-        with:
-          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
-          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
-          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }}
-          parameters: |
-            {
-              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/crisprseq/results-${{ github.sha }}"
-            }
-          profiles: test_screening_full,aws_tower
-      - uses: actions/upload-artifact@v3
-        with:
-          name: Tower debug log file
-          path: tower_action_*.log
diff --git a/.github/workflows/awsfulltest_screening.yml b/.github/workflows/awsfulltest_screening.yml
@@ -0,0 +1,31 @@
+name: nf-core AWS full size tests screening
+# This workflow is triggered on published releases.
+# It can be additionally triggered manually with GitHub actions workflow dispatch button.
+# It runs the -profile 'test_screening_full' on AWS batch
+
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+jobs:
+  run-tower:
+    name: Run AWS full tests
+    if: github.repository == 'nf-core/crisprseq'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Launch workflow via tower
+        uses: nf-core/tower-action@v3
+        with:
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }}/screening_test
+          parameters: |
+            {
+              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/crisprseq/results-${{ github.sha }}/screening_test"
+            }
+          profiles: test_screening_full,aws_tower
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: tower_action_*.log
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add new parameters --reference and --protospacer ([#45](https://github.com/nf-core/crisprseq/pull/45))
 - Add UMI clustering to crisprseq-targeted ([#24](https://github.com/nf-core/crisprseq/pull/24))
 
 ### Fixed

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -53,6 +53,14 @@
 
   > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506.
 
+- [MAGeCK](https://pubmed.ncbi.nlm.nih.gov/25476604/)
+
+  > Li W, Xu H, Xiao T, Cong L, Love MI, Zhang F, Irizarry RA, Liu JS, Brown M, Liu XS. MAGeCK enables robust identification of essential genes from genome-scale CRISPR/Cas9 knockout screens. Genome Biol. 2014;15(12):554.
+
+- [CRISPRcleanR](https://pubmed.ncbi.nlm.nih.gov/30103702/)
+
+  > Iorio F, Behan FM, Gonçalves E, et al. Unsupervised correction of gene-independent cell responses to CRISPR-Cas9 targeting. BMC Genomics. 2018 Aug 13;19(1):604. doi: 10.1186/s12864-018-4989-y.
+
 - [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/)
 
   > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671.

diff --git a/conf/modules.config b/conf/modules.config
@@ -43,6 +43,7 @@ process {
     }
 
     withName: ORIENT_REFERENCE {
+        ext.prefix = { params.reference_fasta ? "${reference.baseName}" : "${meta.id}_reference" }
         publishDir = [
             path: { "${params.outdir}/preprocessing/sequences" },
             mode: params.publish_dir_mode,

diff --git a/docs/usage/targeted.md b/docs/usage/targeted.md
@@ -75,6 +75,54 @@ If the provided samples were sequenced using umi-molecular identifiers (UMIs), u
 6. Repeat a second rand of consensus + polishing (`minimap2` + `racon`)
 7. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html))
 
+## Other input parameters
+
+### Reference
+
+If you want to provide the same reference for every sample, you can select a genome with `--genome` or provide a reference FASTA file with `--reference_fasta`.
+Using either of these two parameters will override any reference sequence provided through an input sample sheet.
+
+Please refer to the [nf-core website](https://nf-co.re/usage/reference_genomes) for general usage docs and guidelines regarding reference genomes.
+
+### Protospacer
+
+If you want to provide the same protospacer sequence for every sample, you can provide the sequence with the parameter `--protospacer`.
+Using this parameter will override any protospacer sequence provided through an input sample sheet.
+
+Providing a protospacer, either through a sample sheet or by using the parameter `--protospacer`, is required.
+
+## Alignment options
+
+By default, the pipeline uses `minimap2` (i.e. `--aligner minimap2`) to map the sequenced FASTQ reads to the reference.
+You also have the option to select other alignment tools by using the parameter `--aligner`. Possible options are `minimap2`, `bwa` or `bowtie2`.
+
+The default alignment with `minimap2` uses adapted parameters which were seen to improve the alignment and reduce potential sequencing or alignment errors.
+The default parameters are:
+
+- A matching score of 29
+- A mismatching penalty of 17
+- A gap open penalty of 25
+- A gap extension penalty of 2.
+
+Please refer to the original [CRISPR-Analytics](https://doi.org/10.1371/journal.pcbi.1011137) publication to see the benchmarking of such parameters.
+
+In order to customise such parameters, you can override the arguments given to `minimap2` by creating a configuration file and providing it to your nextflow run with `-c`:
+
+```groovy
+// Custom config file custom.config
+process {
+    withName: MINIMAP2_ALIGN_ORIGINAL {
+        ext.args = '-A 29 -B 17 -O 25 -E 2'
+    }
+}
+```
+
+Command:
+
+```bash
+nextflow run nf-core/crisprseq --input samplesheet.csv --analysis targeted --outdir <OUTDIR> -profile docker -c custom.config
+```
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:

diff --git a/main.nf b/main.nf
@@ -18,7 +18,7 @@ nextflow.enable.dsl = 2
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')
+params.reference_fasta = params.reference_fasta ?: WorkflowMain.getGenomeAttribute(params, 'fasta')
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/modules/local/orient_reference.nf b/modules/local/orient_reference.nf
@@ -11,16 +11,16 @@ process ORIENT_REFERENCE {
     tuple val(meta), file(reference), val(protospacer)
 
     output:
-    tuple val(meta), path('*_reference-correctOrient.fasta') , emit: reference
-    path "versions.yml"                                      , emit: versions
+    tuple val(meta), path('*-correctOrient.fasta') , emit: reference
+    path "versions.yml"                            , emit: versions
 
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
     revComp_reference.R \\
         $reference \\
-        ${meta.id}_reference-correctOrient.fasta \\
+        ${prefix}-correctOrient.fasta \\
         $protospacer;
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/nextflow.config b/nextflow.config
@@ -13,6 +13,7 @@ params {
     input                      = null
     analysis                   = null
     aligner                    = 'minimap2'
+    protospacer                = null
     library                    = null
     crisprcleanr               = null
     rra_contrasts              = null
@@ -38,6 +39,7 @@ params {
     genome                     = null
     igenomes_base              = 's3://ngi-igenomes/igenomes'
     igenomes_ignore            = false
+    reference_fasta            = null
 
     // MultiQC options
     multiqc_config             = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -88,8 +88,8 @@
             },
             "fa_icon": "fas fa-layer-group"
         },
-        "alignment_parameters": {
-            "title": "Alignment parameters",
+        "targeted_parameters": {
+            "title": "Targeted parameters",
             "type": "object",
             "description": "Parameters used for alignment processes",
             "default": "",
@@ -100,6 +100,11 @@
                     "default": "minimap2",
                     "fa_icon": "fas fa-align-justify",
                     "enum": ["minimap2", "bwa", "bowtie2"]
+                },
+                "protospacer": {
+                    "type": "string",
+                    "fa_icon": "fas fa-grip-lines",
+                    "description": "Provide the same protospacer sequence for all samples. Will override protospacer sequences provided by an input samplesheet."
                 }
             },
             "fa_icon": "fas fa-align-justify"
@@ -189,14 +194,12 @@
                     "fa_icon": "fas fa-book",
                     "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
                 },
-                "fasta": {
+                "reference_fasta": {
                     "type": "string",
-                    "format": "file-path",
-                    "mimetype": "text/plain",
                     "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
-                    "description": "Path to FASTA genome file.",
-                    "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.",
-                    "fa_icon": "far fa-file-code"
+                    "description": "Path to the reference FASTA file. Will override reference sequences provided by an input sample sheet.",
+                    "fa_icon": "far fa-file-alt",
+                    "format": "file-path"
                 },
                 "igenomes_base": {
                     "type": "string",
@@ -420,7 +423,16 @@
             "$ref": "#/definitions/umi_parameters"
         },
         {
-            "$ref": "#/definitions/alignment_parameters"
+            "$ref": "#/definitions/targeted_pipeline_steps"
+        },
+        {
+            "$ref": "#/definitions/umi_parameters"
+        },
+        {
+            "$ref": "#/definitions/targeted_parameters"
+        },
+        {
+            "$ref": "#/definitions/vsearch_parameters"
         },
         {
             "$ref": "#/definitions/vsearch_parameters"

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -120,7 +120,7 @@ def create_protospacer_channel(LinkedHashMap row) {
 
     // add protospacer sequence to meta
     def protospacer_meta = []
-    if (row.protospacer.length() <= 0) {
+    if (row.protospacer.length() <= 0 && !params.protospacer) {
         exit 1, "ERROR: Please check input samplesheet -> Protospacer sequence is not provided!\n"
     } else {
         protospacer_meta = [ meta, row.protospacer ]

diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf
@@ -10,7 +10,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 WorkflowCrisprseq.initialise(params, log)
 
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.multiqc_config, params.fasta, params.library, params.mle_design_matrix ]
+def checkPathParamList = [ params.multiqc_config, params.reference_fasta, params.library, params.mle_design_matrix ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters

diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf
@@ -10,7 +10,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 WorkflowCrisprseq.initialise(params, log)
 
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.multiqc_config ]
+def checkPathParamList = [ params.input, params.multiqc_config, params.reference_fasta ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
@@ -162,12 +162,6 @@ workflow CRISPRSEQ_TARGETED {
     .set { ch_fastq }
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
 
-    INPUT_CHECK.out.reference
-    .map {
-        meta, fastq ->
-            [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ]
-    }
-
     //
     // MODULE: Add reference sequences to file
     //
@@ -196,14 +190,42 @@ workflow CRISPRSEQ_TARGETED {
 
     // Join channels with reference and protospacer
     // to channel: [ meta, reference, protospacer]
-    SEQ_TO_FILE_REF.out.file
-        .join(INPUT_CHECK.out.protospacer
-            .map {
-                meta, fastq ->
-                    [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ]
-            },
-            by: 0)
-        .set{ reference_protospacer }
+    if (!params.reference_fasta && !params.protospacer) {
+        SEQ_TO_FILE_REF.out.file
+            .join(INPUT_CHECK.out.protospacer
+                .map {
+                    meta, fastq ->
+                        [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ]
+                },
+                by: 0)
+            .set{ reference_protospacer }
+    } else if (!params.reference_fasta) {
+        // If a protospacer was provided through the --protospacer param instead of the samplesheet
+        ch_protospacer = Channel.of(params.protospacer)
+        SEQ_TO_FILE_REF.out.file
+            .combine(ch_protospacer)
+            .set{ reference_protospacer }
+    } else if (!params.protospacer) {
+        // If a reference was provided through a fasta file or igenomes instead of the samplesheet
+        ch_reference = Channel.fromPath(params.reference_fasta)
+        INPUT_CHECK.out.protospacer
+            .combine(ch_reference)
+            .map{ meta, protospacer, reference ->
+                [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], reference, protospacer ]
+            }
+            .set{ reference_protospacer }
+    } else {
+        ch_reference = Channel.fromPath(params.reference_fasta)
+        ch_protospacer = Channel.of(params.protospacer)
+        INPUT_CHECK.out.reads
+            .combine(ch_reference)
+            .combine(ch_protospacer)
+            .map{ meta, reads, reference, protospacer ->
+                [meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], reference, protospacer]
+            }
+            .set{ reference_protospacer }
+    }
+
 
     //
     // MODULE: Prepare reference sequence