
Add UMI clustering #24

Merged
merged 51 commits
Jun 28, 2023
Changes from 42 commits
Commits
51 commits
760fa55
add extract umi step
mirpedrol Mar 15, 2023
21cda6b
add test_umis.config
mirpedrol Mar 15, 2023
373b809
fix extract_umis
mirpedrol Mar 17, 2023
ab49b68
install vsearch
mirpedrol Mar 17, 2023
1eb72b8
add vsearch/cluster step
mirpedrol Mar 20, 2023
8467cd9
don't compress cluster fastas
mirpedrol Mar 20, 2023
bc43a97
add closure to obtain sequence from umi
mirpedrol Mar 21, 2023
181c3d5
try to make umi_to_sequence.py faster
mirpedrol Mar 24, 2023
2198371
use a groovy function for umi_to_sequence
mirpedrol Mar 24, 2023
828dbf5
install vsearch/sort
mirpedrol Apr 5, 2023
731ffb3
add umi_bin_size parameter
mirpedrol Apr 5, 2023
46535d8
update vsearch/sort and add sequence obtention from umi
mirpedrol Apr 6, 2023
75584e2
add first minimap step for UMI consensus
mirpedrol Apr 6, 2023
4199628
install new module racon
mirpedrol Apr 6, 2023
e9adce8
add racon
mirpedrol Apr 6, 2023
a85fc45
fix bug returning only one sequence from UMI cluster and filter UMI a…
mirpedrol May 5, 2023
e8adbd0
Merge branch 'dev' of https://github.com/nf-core/crisprseq into umi-c…
mirpedrol May 5, 2023
424ee03
update changelog
mirpedrol May 5, 2023
07e21ff
add test_umi tests to CI
mirpedrol May 5, 2023
4ff306e
install medaka
mirpedrol May 5, 2023
ea19682
install samtools/faidx
mirpedrol May 5, 2023
9bf0105
run faidx
mirpedrol May 8, 2023
ebf6fd5
add minimap and samtools for mini_align process
May 9, 2023
d883f15
remove mini_align as it's now performed by medaka_consensus
May 9, 2023
de43d35
add medaka process
May 9, 2023
353a8ba
join all umi consensus in a file
mirpedrol May 9, 2023
ac9f267
convert fasta to fastq after umi consensus
mirpedrol May 9, 2023
be105b4
remove dummi_final_umi process
mirpedrol May 9, 2023
798b4db
pass final umi consensus tu clustering_summary
mirpedrol May 9, 2023
8d10395
pass final umi consensus to mapping steps
mirpedrol May 9, 2023
42302dd
unzip input medaka
mirpedrol May 11, 2023
a996906
make removing overrepresented sequences optional
mirpedrol May 15, 2023
2706b7a
unzip medaka input
mirpedrol May 15, 2023
c0974bf
fix medaka error
mirpedrol May 15, 2023
565e6e8
hopefully fix umis bug
mirpedrol May 22, 2023
4a88719
Merge branch 'umi-clustering' of https://github.com/mirpedrol/crisprs…
mirpedrol Jun 12, 2023
eeffbc6
Merge branch 'mirpedrol-umi-clustering' into dev
mirpedrol Jun 12, 2023
e15d2ef
Merge branch 'dev' of https://github.com/nf-core/crisprseq into umi-c…
mirpedrol Jun 12, 2023
6c57e3e
fix typo
mirpedrol Jun 12, 2023
6dfba88
update schema
mirpedrol Jun 12, 2023
4ed1882
add analysis param in test_umis
mirpedrol Jun 12, 2023
0482c06
add docs for UMIs
mirpedrol Jun 12, 2023
7893867
Merge branch 'dev' of https://github.com/nf-core/crisprseq into umi-c…
mirpedrol Jun 27, 2023
4ab6d0b
Apply suggestions from code review
mirpedrol Jun 27, 2023
ae81ac0
add --umi_clustering parameter to opt-in the umi clustering steps
mirpedrol Jun 27, 2023
602a52b
add pipeline optional step parameters to usage documentation
mirpedrol Jun 27, 2023
09201ed
add pipeline optional step parameters to usage documentation
mirpedrol Jun 27, 2023
bf99c28
fix alias SEQTK_SEQ_MASK
mirpedrol Jun 27, 2023
fb1c14f
fix: MERGING_SUMMARY runs when --overrepresented is false
mirpedrol Jun 27, 2023
7f9bfda
add param umi_clustering to test_umis
mirpedrol Jun 28, 2023
2227d89
Merge branch 'dev' of https://github.com/nf-core/crisprseq into umi-c…
mirpedrol Jun 28, 2023
23 changes: 23 additions & 0 deletions .github/workflows/ci.yml
@@ -61,3 +61,26 @@ jobs:
- name: Run pipeline with test data - screening
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_screening,docker --outdir ./results_screening

umis:
name: Run pipeline with UMI clustering test data
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/crisprseq') }}"
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "22.10.1"
- "latest-everything"
steps:
- name: Check out pipeline code
uses: actions/checkout@v3

- name: Install Nextflow
uses: nf-core/setup-nextflow@v1
with:
version: "${{ matrix.NXF_VER }}"

- name: Run pipeline with UMI clustering test data
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_umis,docker --outdir ./result
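
The new `umis` job mirrors the existing targeted/screening CI jobs but uses the `test_umis` profile. A local equivalent of this check might look as follows (a sketch, assuming Nextflow ≥ 22.10.1 and Docker are available; the output directory name is arbitrary):

```bash
# Local reproduction of the new CI job (sketch)
nextflow run nf-core/crisprseq -profile test_umis,docker --outdir ./results_umis
```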
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

### Added

- Add UMI clustering to crisprseq-edition ([#24](https://github.com/nf-core/crisprseq/pull/24))

### Fixed

- Fix warning "module used more than once" ([#25](https://github.com/nf-core/crisprseq/pull/25))
18 changes: 14 additions & 4 deletions README.md
@@ -32,11 +32,19 @@ For crispr targeted :
2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
3. Adapter trimming ([`Cutadapt`](http://dx.doi.org/10.14806/ej.17.1.200))
4. Quality filtering ([`Seqtk`](https://github.com/lh3/seqtk))
5. Read mapping:
5. UMI clustering (optional):
1. Extract UMI sequences (Python script)
2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch))
3. Obtain the most abundant UMI sequence for each cluster ([`Vsearch`](https://github.com/torognes/vsearch))
4. Obtain a consensus for each cluster ([`minimap2`](https://github.com/lh3/minimap2))
5. Polish consensus sequence ([`racon`](https://github.com/lbcb-sci/racon))
   6. Repeat a second round of consensus + polishing (`minimap2` + `racon`)
7. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html))
6. Read mapping:
- ([`minimap2`](https://github.com/lh3/minimap2), _default_)
- ([`bwa`](http://bio-bwa.sourceforge.net/))
- ([`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml))
6. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/))
7. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/))

For crispr screening :

@@ -56,7 +64,9 @@ For crispr screening :
3. Download the pipeline and test it on a minimal dataset with a single command:

```bash
nextflow run nf-core/crisprseq -profile test,YOURPROFILE --outdir <OUTDIR>
nextflow run nf-core/crisprseq -profile test_screening,YOURPROFILE --outdir <OUTDIR>
# or
nextflow run nf-core/crisprseq -profile test_targeted,YOURPROFILE --outdir <OUTDIR>
```

Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
Expand All @@ -69,7 +79,7 @@ For crispr screening :
4. Start running your own analysis!

```bash
nextflow run nf-core/crisprseq --input samplesheet.csv --outdir <OUTDIR> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
nextflow run nf-core/crisprseq --input samplesheet.csv --analysis <targeted/screening> --outdir <OUTDIR> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
```

## Documentation
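The README changes above make the UMI clustering steps opt-in via the new `--umi_clustering` parameter (commit ae81ac0). A minimal sketch of a targeted run with UMI clustering enabled, assuming `--umi_clustering` is a plain boolean switch and reusing the placeholder paths from the README:

```bash
# Sketch: typical targeted run with the new opt-in UMI clustering enabled.
# samplesheet.csv and <OUTDIR> are placeholders, as in the README.
nextflow run nf-core/crisprseq \
    --input samplesheet.csv \
    --analysis targeted \
    --umi_clustering \
    --outdir <OUTDIR> \
    -profile docker
```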
8 changes: 6 additions & 2 deletions bin/extract_umis.py
@@ -1,6 +1,10 @@
#!/usr/bin/env python3
# FROM: pipeline-umi-amplicon distributed by ONT https://github.com/nanoporetech/pipeline-umi-amplicon
# https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py
#
# This code is obtained from Oxford Nanopore Technologies pipeline-umi-amplicon
# Distributed under the Mozilla Public License, v. 2.0
#
# Original source code: https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py
#

import argparse
import logging
50 changes: 49 additions & 1 deletion conf/modules.config
@@ -139,6 +139,18 @@ process {
ext.args = '--max-error 3 --adapter-length 250 --fwd-context ""'
}

withName: VSEARCH_CLUSTER {
ext.args = "--minseqlength ${params.vsearch_minseqlength} --maxseqlength ${params.vsearch_maxseqlength} --qmask none --clusterout_sort --gapopen 0E/5I --gapext 0E/2I --mismatch -8 --match 6 --iddef 0 --minwordmatches 0 --qmask none -id ${params.vsearch_id}"
ext.args2 = '--cluster_fast'
ext.args3 = '--clusters'
}


withName: VSEARCH_SORT {
ext.args = '--topn 1'
ext.prefix = { "${fasta.baseName}_top" }
}

withName: MERGING_SUMMARY {
publishDir = [
path: { "${params.outdir}/summary/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
@@ -147,7 +159,43 @@ process {
]
}

withName: DUMMY_FINAL_UMI {
withName: MINIMAP2_ALIGN_UMI_1 {
ext.args = '-x map-ont'
ext.prefix = { "${reads.baseName}_cycle1" }
publishDir = [
path: { "${params.outdir}/minimap2_umi" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: MINIMAP2_ALIGN_UMI_2 {
ext.args = '-x map-ont'
ext.prefix = { "${reads.baseName}_cycle2" }
publishDir = [
path: { "${params.outdir}/minimap2_umi" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: RACON_1 {
ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming'
ext.prefix = { "${reads.baseName}_cycle1" }
}

withName: RACON_2 {
ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming'
ext.prefix = { "${reads.baseName}_cycle2" }
}

withName: MEDAKA {
ext.args = '-m r941_min_high_g303'
ext.prefix = { "${reads.baseName}_medakaConsensus" }
}

withName: SEQTK_SEQ_FATOFQ {
ext.args = '-F "#"'
publishDir = [
path: { "${params.outdir}/preprocessing/UMI" },
mode: params.publish_dir_mode,
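Because the new modules are configured entirely through `ext.args`/`ext.prefix`, users should be able to tune them without touching the pipeline by supplying a custom config. A sketch, assuming the standard Nextflow `-c` mechanism; the alternative Medaka model shown is purely illustrative and must match your basecaller:

```bash
# Sketch: overriding the default Medaka model set in conf/modules.config.
# The model name below is an example only (assumption), not a recommendation.
cat > custom_umi.config <<'EOF'
process {
    withName: MEDAKA {
        ext.args = '-m r941_min_high_g360'
    }
}
EOF
# Then add `-c custom_umi.config` to the usual `nextflow run nf-core/crisprseq ...` command.
```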
28 changes: 28 additions & 0 deletions conf/test_umis.config
@@ -0,0 +1,28 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests (with UMIs option)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
    nextflow run nf-core/crisprseq -profile test_umis,<conda/docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile UMIs'
config_profile_description = 'Minimal test dataset to check pipeline function with UMIs option'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv'
analysis = 'targeted'

// Aligner
aligner = 'minimap2'
}
69 changes: 61 additions & 8 deletions docs/output_targeted.md
@@ -11,14 +11,18 @@ The directories listed below will be created in the results directory after the
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

- [Preprocessing](#preprocessing)
- [Sequences](#sequences) - Input sequence preparation (reference, protospacer, template)
- [sequences](#sequences) - Input sequence preparation (reference, protospacer, template)
- [cat](#cat) - Concatenate sample fastq files if required
- [Pear](#pear) - Join double-end reads if required
- [FastQC](#fastqc) - Read Quality Control
- [Adapters](#adapters) - Find adapters (Overrepresented sequences) in reads
- [Cutadapt](#cutadapt) - Trim adapters
- [Seqtk](#seqtk) - Mask low-quality bases
<!-- -UMI(#umi) -->
- [pear](#pear) - Join double-end reads if required
- [fastqc](#fastqc) - Read Quality Control
- [adapters](#adapters) - Find adapters (Overrepresented sequences) in reads
- [cutadapt](#cutadapt) - Trim adapters
- [seqtk](#seqtk) - Mask low-quality bases
- [UMI clustering](#umi-clustering)
- [vsearch](#vsearch)
- [minimap2 umi](#minimap2-umi)
- [racon](#racon)
- [medaka](#medaka)
- [Mapping](#mapping)
- [minimap2](#minimap2) - Mapping reads to reference
- [BWA](#bwa) - Mapping reads to reference
Expand Down Expand Up @@ -133,7 +137,56 @@ If multiple libraries/runs have been provided for the same sample in the input s

[Seqtk](https://github.com/lh3/seqtk) masks (converts to Ns) bases with quality lower than 20 and removes sequences shorter than 80 bases.

<!-- ### UMI -->
## UMI clustering

### vsearch

<details markdown="1">
<summary>Output files</summary>

- `vsearch/`
- `*_clusters*`: Contains all UMI sequences which clustered together.
- `*_clusters*_top.fasta`: Contains the most abundant UMI sequence from the cluster.

</details>

[VSEARCH](https://github.com/torognes/vsearch) is a versatile open-source tool which includes chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. `vsearch/cluster` can cluster sequences using a single-pass, greedy centroid-based clustering algorithm. `vsearch/sort` can sort fasta entries by decreasing abundance (`--sortbysize`) or sequence length (`--sortbylength`).

### minimap2 umi

<details markdown="1">
<summary>Output files</summary>

- `minimap2_umi/`
  - `*_sequences_cycle[1,2].paf`: Alignment of the cluster sequences against the top UMI sequence in PAF format.

</details>

[Minimap2](https://github.com/lh3/minimap2) is a sequence alignment program that aligns DNA sequences against a reference database.

### racon

<details markdown="1">
<summary>Output files</summary>

- `racon/`
  - `*_sequences_cycle[1,2]_assembly_consensus.fasta.gz`: Consensus sequence obtained from the cluster multiple sequence alignment.

</details>

[Racon](https://github.com/lbcb-sci/racon) is an ultrafast consensus module for raw de novo genome assembly of long uncorrected reads.

### medaka

<details markdown="1">
<summary>Output files</summary>

- `medaka/`
- `*_medakaConsensus.fasta`: Final consensus sequence of each UMI cluster. Obtained after two rounds of minimap2 + racon.

</details>

[Medaka](https://nanoporetech.github.io/medaka/index.html) is a tool to create consensus sequences and variant calls from nanopore sequencing data.
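
Taken together, the UMI clustering stages documented above correspond roughly to the per-cluster commands sketched below. This is a simplified illustration using the argument values from `conf/modules.config` and hypothetical file names; the pipeline wires these steps together as Nextflow modules rather than a shell script, and the vsearch identity threshold and sort mode shown here are assumptions (the real values come from pipeline parameters such as `params.vsearch_id`).

```bash
# Simplified per-cluster sketch of the UMI clustering stages (hypothetical file names).
# UMI extraction with bin/extract_umis.py happens before this point.

# 1. Cluster the extracted UMI sequences (vsearch/cluster); writes one FASTA per
#    cluster: sample_clusters0, sample_clusters1, ... The 0.99 identity is illustrative.
vsearch --cluster_fast sample_umis.fasta --id 0.99 --qmask none --clusters sample_clusters

# 2. Keep the top UMI sequence of a cluster (vsearch/sort; --sortbysize or
#    --sortbylength depending on the module invocation).
vsearch --sortbysize sample_clusters0 --topn 1 --output sample_clusters0_top.fasta

# 3. Two rounds of consensus + polishing (minimap2 + racon), args as in conf/modules.config.
minimap2 -x map-ont sample_clusters0_top.fasta sample_clusters0 > cycle1.paf
racon -t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming \
    sample_clusters0 cycle1.paf sample_clusters0_top.fasta > consensus_cycle1.fasta
minimap2 -x map-ont consensus_cycle1.fasta sample_clusters0 > cycle2.paf
racon -t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming \
    sample_clusters0 cycle2.paf consensus_cycle1.fasta > consensus_cycle2.fasta

# 4. Final consensus with Medaka (model taken from conf/modules.config).
medaka_consensus -i sample_clusters0 -d consensus_cycle2.fasta -o medaka_out -m r941_min_high_g303

# 5. Convert the final consensus FASTA to FASTQ with dummy qualities (seqtk).
seqtk seq -F '#' medaka_out/consensus.fasta > sample_umi_consensus.fastq
```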

## Mapping

4 changes: 2 additions & 2 deletions docs/usage_targeted.md
@@ -53,10 +53,10 @@ An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv)

## Running the pipeline

The typical command for running the pipeline is as follows:
The typical command for running the pipeline for targeted CRISPR analysis is as follows:

```bash
nextflow run nf-core/crisprseq --input samplesheet.csv --outdir <OUTDIR> -profile docker
nextflow run nf-core/crisprseq --input samplesheet.csv --analysis targeted --outdir <OUTDIR> -profile docker
```

This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
32 changes: 32 additions & 0 deletions modules.json
@@ -52,6 +52,12 @@
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"medaka": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"],
"patch": "modules/nf-core/medaka/medaka.diff"
},
"mageck/count": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
@@ -76,6 +82,11 @@
"installed_by": ["modules"],
"patch": "modules/nf-core/minimap2/align/minimap2-align.diff"
},
"minimap2/index": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
},
"multiqc": {
"branch": "master",
"git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba",
@@ -86,6 +97,16 @@
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"racon": {
"branch": "master",
"git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
"installed_by": ["modules"]
},
"samtools/faidx": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
},
"samtools/index": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
@@ -95,6 +116,17 @@
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"vsearch/cluster": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"],
"patch": "modules/nf-core/vsearch/cluster/vsearch-cluster.diff"
},
"vsearch/sort": {
"branch": "master",
"git_sha": "e7801603532df26b4bb4ef324ca2c39f7a4d0eee",
"installed_by": ["modules"]
}
}
},
55 changes: 0 additions & 55 deletions modules/local/dummy_final_umi.nf

This file was deleted.
