From 760fa55487852d887a4140ca9c52b45bb8c36d54 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Wed, 15 Mar 2023 09:54:26 +0100
Subject: [PATCH 01/45] add extract umi step

---
 bin/extract_umis.py    | 8 ++++++--
 workflows/crisprseq.nf | 6 ++++--
 2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/bin/extract_umis.py b/bin/extract_umis.py
index 78802c9f..e4792d0f 100755
--- a/bin/extract_umis.py
+++ b/bin/extract_umis.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python3
-# FROM: pipeline-umi-amplicon distributed by ONT https://github.com/nanoporetech/pipeline-umi-amplicon
-# https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py
+#
+# This code is obtained from Oxford Nanopore Technologies pipeline-umi-amplicon
+# Distributed under the Mozilla Public License, v. 2.0
+#
+# Original source code: https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py
+#
 
 import argparse
 import logging
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 11174d93..fe06f95f 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -269,8 +269,6 @@ workflow CRISPRSEQ {
     }
 
 
-    /*
-    The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 
     //
     // MODULE: Extract UMI sequences
@@ -279,6 +277,10 @@ workflow CRISPRSEQ {
         SEQTK_SEQ.out.fastx
     )
 
+    /*
+    The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
+
+
     Modules to implement:
 
     vsearch

From 21cda6b85a715ce492d6b6b9702ac1d1a38f9252 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Wed, 15 Mar 2023 10:33:06 +0100
Subject: [PATCH 02/45] add test_umis.config

---
 conf/test_umis.config  | 27 +++++++++++++++++++++++++++
 nextflow.config        |  1 +
 workflows/crisprseq.nf |  2 ++
 3 files changed, 30 insertions(+)
 create mode 100644 conf/test_umis.config

diff --git a/conf/test_umis.config b/conf/test_umis.config
new file mode 100644
index 00000000..42eab8f8
--- /dev/null
+++ b/conf/test_umis.config
@@ -0,0 +1,27 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests (with UMIs option)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/crisprseq -profile test,<conda/docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile UMIs'
+    config_profile_description = 'Minimal test dataset to check pipeline function with UMIs option'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input  = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv'
+
+    // Aligner
+    aligner = 'minimap2'
+}
diff --git a/nextflow.config b/nextflow.config
index da037ee2..e415ebe9 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -144,6 +144,7 @@ profiles {
     }
     test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
+    test_umis { includeConfig 'conf/test_umis.config' }
 }
 
 
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index fe06f95f..439d60d5 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -277,6 +277,8 @@ workflow CRISPRSEQ {
         SEQTK_SEQ.out.fastx
     )
 
+    EXTRACT_UMIS.out.fasta.view()
+
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 

From 373b8095499dcd09a6b40b965d7397843cb881da Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 17 Mar 2023 11:42:18 +0100
Subject: [PATCH 03/45] fix extract_umis

---
 modules/local/extract_umis.nf | 6 +++---
 workflows/crisprseq.nf        | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/modules/local/extract_umis.nf b/modules/local/extract_umis.nf
index 4f9be2a3..d6c8fb44 100644
--- a/modules/local/extract_umis.nf
+++ b/modules/local/extract_umis.nf
@@ -11,9 +11,9 @@ process EXTRACT_UMIS {
     tuple val(meta), path(reads)
 
     output:
-    tuple val(meta), path(fasta) , emit: fasta
-    tuple val(meta), path(tsv)   , emit: tsv
-    path "versions.yml"          , emit: versions
+    tuple val(meta), path("*.fasta") , emit: fasta
+    tuple val(meta), path("*.tsv")   , emit: tsv
+    path "versions.yml"              , emit: versions
 
     script:
     def args = task.ext.args ?: ''
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 439d60d5..48ceab53 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -269,7 +269,6 @@ workflow CRISPRSEQ {
     }
 
 
-
     //
     // MODULE: Extract UMI sequences
     //

From ab49b680eaafef5b36918e26e3d663c02a743d45 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 17 Mar 2023 13:08:59 +0100
Subject: [PATCH 04/45] install vsearch

---
 modules.json                             |  5 ++
 modules/nf-core/vsearch/cluster/main.nf  | 73 ++++++++++++++++++++++++
 modules/nf-core/vsearch/cluster/meta.yml | 68 ++++++++++++++++++++++
 workflows/crisprseq.nf                   |  1 +
 4 files changed, 147 insertions(+)
 create mode 100644 modules/nf-core/vsearch/cluster/main.nf
 create mode 100644 modules/nf-core/vsearch/cluster/meta.yml

diff --git a/modules.json b/modules.json
index 175f4522..350f06ae 100644
--- a/modules.json
+++ b/modules.json
@@ -71,6 +71,11 @@
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
+                    },
+                    "vsearch/cluster": {
+                        "branch": "master",
+                        "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
+                        "installed_by": ["modules"]
                     }
                 }
             },
diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf
new file mode 100644
index 00000000..7bcda8ac
--- /dev/null
+++ b/modules/nf-core/vsearch/cluster/main.nf
@@ -0,0 +1,73 @@
+process VSEARCH_CLUSTER {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::vsearch=2.21.1 bioconda::samtools=1.16.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0':
+        'quay.io/biocontainers/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path('*.aln.gz')                , optional: true, emit: aln
+    tuple val(meta), path('*.biom.gz')               , optional: true, emit: biom
+    tuple val(meta), path('*.mothur.tsv.gz')         , optional: true, emit: mothur
+    tuple val(meta), path('*.otu.tsv.gz')            , optional: true, emit: otu
+    tuple val(meta), path('*.bam')                   , optional: true, emit: bam
+    tuple val(meta), path('*.out.tsv.gz')            , optional: true, emit: out
+    tuple val(meta), path('*.blast.tsv.gz')          , optional: true, emit: blast
+    tuple val(meta), path('*.uc.tsv.gz')             , optional: true, emit: uc
+    tuple val(meta), path('*.centroids.fasta.gz')    , optional: true, emit: centroids
+    tuple val(meta), path('*.clusters.fasta.gz')     , optional: true, emit: clusters
+    tuple val(meta), path('*.profile.txt.gz')        , optional: true, emit: profile
+    tuple val(meta), path('*.msa.fasta.gz')          , optional: true, emit: msa
+    path "versions.yml"                              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def args3 = task.ext.args3 ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) {
+            error "Unknown clustering option provided (${args2})"
+        }
+    def out_ext = args3.contains("--alnout") ? "aln" :
+                    args3.contains("--biomout") ? "biom" :
+                    args3.contains("--blast6out") ? "blast.tsv" :
+                    args3.contains("--centroids") ? "centroids.fasta" :
+                    args3.contains("--clusters") ? "clusters.fasta" :
+                    args3.contains("--mothur_shared_out") ? "mothur.tsv" :
+                    args3.contains("--msaout") ? "msa.fasta" :
+                    args3.contains("--otutabout") ? "otu.tsv" :
+                    args3.contains("--profile") ? "profile.txt" :
+                    args3.contains("--samout") ? "sam" :
+                    args3.contains("--uc") ? "uc.tsv" :
+                    args3.contains("--userout") ? "out.tsv" :
+                    ""
+    if (out_ext == "") { error "Unknown output file format provided (${args3})" }
+    """
+    vsearch \\
+        $args2 $fasta \\
+        $args3 ${prefix}.${out_ext} \\
+        --threads $task.cpus \\
+        $args
+
+    if [[ $args3 != "--samout" ]]
+    then
+        gzip -n ${prefix}.${out_ext}
+    else
+        samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/vsearch/cluster/meta.yml b/modules/nf-core/vsearch/cluster/meta.yml
new file mode 100644
index 00000000..5cc620cb
--- /dev/null
+++ b/modules/nf-core/vsearch/cluster/meta.yml
@@ -0,0 +1,68 @@
+name: "vsearch_cluster"
+description: Cluster sequences using a single-pass, greedy centroid-based clustering algorithm.
+keywords:
+  - vsearch
+  - clustering
+tools:
+  - vsearch:
+      description: VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative)
+      homepage: https://github.com/torognes/vsearch
+      documentation: https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf
+      tool_dev_url: https://github.com/torognes/vsearch
+      doi: 10.7717/peerj.2584
+      licence: ["GPL v3-or-later OR BSD-2-clause"]
+
+input:
+  - meta:
+      type: map
+      description: Groovy Map containing sample information e.g. [ id:'test' ]
+  - fasta:
+      type: file
+      description: Sequences to cluster in FASTA format
+      pattern: "*.{fasta,fa,fasta.gz,fa.gz}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - aln:
+      type: file
+      description: Results in pairwise alignment format
+      pattern: "*.aln.gz"
+  - biom:
+      type: file
+      description: Results in an OTU table in the biom version 1.0 file format
+      pattern: "*.biom.gz"
+  - mothur:
+      type: file
+      description: Results in an OTU table in the mothur ’shared’ tab-separated plain text file format
+      pattern: "*.mothur.tsv.gz"
+  - otu:
+      type: file
+      description: Results in an OTU table in the classic tab-separated plain text format
+      pattern: "*.otu.tsv.gz"
+  - bam:
+      type: file
+      description: Results written in bam format
+      pattern: "*.bam"
+  - out:
+      type: file
+      description: Results in tab-separated output, columns defined by user
+      pattern: "*.out.tsv.gz"
+  - blast:
+      type: file
+      description: Tab delimited results in blast-like tabular format
+      pattern: "*.blast.tsv.gz"
+  - uc:
+      type: file
+      description: Tab delimited results in a uclust-like format with 10 columns
+      pattern: "*.uc.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@mirpedrol"
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 48ceab53..d7d0c55e 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -68,6 +68,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS                   } from '../modules/nf-co
 include { PEAR                                          } from '../modules/nf-core/pear/main'
 include { CAT_FASTQ                                     } from '../modules/nf-core/cat/fastq/main'
 include { SEQTK_SEQ                                     } from '../modules/nf-core/seqtk/seq/main'
+include { VSEARCH_CLUSTER                               } from '../modules/nf-core/vsearch/cluster/main'
 include { BOWTIE2_ALIGN                                 } from '../modules/nf-core/bowtie2/align/main'
 include { BOWTIE2_BUILD                                 } from '../modules/nf-core/bowtie2/build/main'
 include { BWA_MEM                                       } from '../modules/nf-core/bwa/mem/main'

From 1eb72b8888293b4fa6dac9b6ae2a70f6c7a21e6b Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 20 Mar 2023 09:19:26 +0100
Subject: [PATCH 05/45] add vsearch/cluster step

---
 conf/modules.config                           |  6 ++
 modules.json                                  |  3 +-
 modules/nf-core/vsearch/cluster/main.nf       | 29 +++-----
 .../vsearch/cluster/vsearch-cluster.diff      | 50 +++++++++++++
 nextflow.config                               |  4 ++
 nextflow_schema.json                          | 72 ++++++++++++++-----
 workflows/crisprseq.nf                        |  5 +-
 7 files changed, 132 insertions(+), 37 deletions(-)
 create mode 100644 modules/nf-core/vsearch/cluster/vsearch-cluster.diff

diff --git a/conf/modules.config b/conf/modules.config
index eb6eb831..f4a637a7 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -105,6 +105,12 @@ process {
         ext.args = '--max-error 3 --adapter-length 250 --fwd-context ""'
     }
 
+    withName: VSEARCH_CLUSTER {
+        ext.args = "--minseqlength ${params.vsearch_minseqlength} --maxseqlength ${params.vsearch_maxseqlength} --qmask none --clusterout_sort --gapopen 0E/5I --gapext 0E/2I --mismatch -8 --match 6 --iddef 0 --minwordmatches 0 --qmask none -id ${params.vsearch_id}"
+        ext.args2 = '--cluster_fast'
+        ext.args3 = '--clusters'
+    }
+
     withName: MERGING_SUMMARY {
         publishDir = [
             path: { "${params.outdir}/summary/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
diff --git a/modules.json b/modules.json
index 350f06ae..c741cfd1 100644
--- a/modules.json
+++ b/modules.json
@@ -75,7 +75,8 @@
                     "vsearch/cluster": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/vsearch/cluster/vsearch-cluster.diff"
                     }
                 }
             },
diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf
index 7bcda8ac..c450470b 100644
--- a/modules/nf-core/vsearch/cluster/main.nf
+++ b/modules/nf-core/vsearch/cluster/main.nf
@@ -37,16 +37,16 @@ process VSEARCH_CLUSTER {
     if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) {
             error "Unknown clustering option provided (${args2})"
         }
-    def out_ext = args3.contains("--alnout") ? "aln" :
-                    args3.contains("--biomout") ? "biom" :
-                    args3.contains("--blast6out") ? "blast.tsv" :
-                    args3.contains("--centroids") ? "centroids.fasta" :
-                    args3.contains("--clusters") ? "clusters.fasta" :
-                    args3.contains("--mothur_shared_out") ? "mothur.tsv" :
-                    args3.contains("--msaout") ? "msa.fasta" :
-                    args3.contains("--otutabout") ? "otu.tsv" :
-                    args3.contains("--profile") ? "profile.txt" :
-                    args3.contains("--samout") ? "sam" :
+    def out_ext = args3.contains("--alnout") ? ".aln" :
+                    args3.contains("--biomout") ? ".biom" :
+                    args3.contains("--blast6out") ? ".blast.tsv" :
+                    args3.contains("--centroids") ? ".centroids.fasta" :
+                    args3.contains("--clusters") ? "_clusters" :
+                    args3.contains("--mothur_shared_out") ? ".mothur.tsv" :
+                    args3.contains("--msaout") ? ".msa.fasta" :
+                    args3.contains("--otutabout") ? ".otu.tsv" :
+                    args3.contains("--profile") ? ".profile.txt" :
+                    args3.contains("--samout") ? ".sam" :
                     args3.contains("--uc") ? "uc.tsv" :
                     args3.contains("--userout") ? "out.tsv" :
                     ""
@@ -54,17 +54,10 @@ process VSEARCH_CLUSTER {
     """
     vsearch \\
         $args2 $fasta \\
-        $args3 ${prefix}.${out_ext} \\
+        $args3 ${prefix}${out_ext} \\
         --threads $task.cpus \\
         $args
 
-    if [[ $args3 != "--samout" ]]
-    then
-        gzip -n ${prefix}.${out_ext}
-    else
-        samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam
-    fi
-
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//')
diff --git a/modules/nf-core/vsearch/cluster/vsearch-cluster.diff b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff
new file mode 100644
index 00000000..13ad9aa5
--- /dev/null
+++ b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff
@@ -0,0 +1,50 @@
+Changes in module 'nf-core/vsearch/cluster'
+--- modules/nf-core/vsearch/cluster/main.nf
++++ modules/nf-core/vsearch/cluster/main.nf
+@@ -37,16 +37,16 @@
+     if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) {
+             error "Unknown clustering option provided (${args2})"
+         }
+-    def out_ext = args3.contains("--alnout") ? "aln" :
+-                    args3.contains("--biomout") ? "biom" :
+-                    args3.contains("--blast6out") ? "blast.tsv" :
+-                    args3.contains("--centroids") ? "centroids.fasta" :
+-                    args3.contains("--clusters") ? "clusters.fasta" :
+-                    args3.contains("--mothur_shared_out") ? "mothur.tsv" :
+-                    args3.contains("--msaout") ? "msa.fasta" :
+-                    args3.contains("--otutabout") ? "otu.tsv" :
+-                    args3.contains("--profile") ? "profile.txt" :
+-                    args3.contains("--samout") ? "sam" :
++    def out_ext = args3.contains("--alnout") ? ".aln" :
++                    args3.contains("--biomout") ? ".biom" :
++                    args3.contains("--blast6out") ? ".blast.tsv" :
++                    args3.contains("--centroids") ? ".centroids.fasta" :
++                    args3.contains("--clusters") ? "_clusters" :
++                    args3.contains("--mothur_shared_out") ? ".mothur.tsv" :
++                    args3.contains("--msaout") ? ".msa.fasta" :
++                    args3.contains("--otutabout") ? ".otu.tsv" :
++                    args3.contains("--profile") ? ".profile.txt" :
++                    args3.contains("--samout") ? ".sam" :
+                     args3.contains("--uc") ? "uc.tsv" :
+                     args3.contains("--userout") ? "out.tsv" :
+                     ""
+@@ -54,16 +54,9 @@
+     """
+     vsearch \\
+         $args2 $fasta \\
+-        $args3 ${prefix}.${out_ext} \\
++        $args3 ${prefix}${out_ext} \\
+         --threads $task.cpus \\
+         $args
+-
+-    if [[ $args3 != "--samout" ]]
+-    then
+-        gzip -n ${prefix}.${out_ext}
+-    else
+-        samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam
+-    fi
+ 
+     cat <<-END_VERSIONS > versions.yml
+     "${task.process}":
+
+************************************************************
diff --git a/nextflow.config b/nextflow.config
index e415ebe9..1aaf3c0e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,6 +13,10 @@ params {
     input                      = null
     aligner                    = 'minimap2'
 
+    // Vsearch options
+    vsearch_minseqlength       = 55
+    vsearch_maxseqlength       = 57
+    vsearch_id                 = 0.99
 
     // References
     genome                     = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 20ed8752..efad63df 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -5,22 +5,6 @@
     "description": "Pipeline for the analysis of crispr data",
     "type": "object",
     "definitions": {
-        "alignment_parameters": {
-            "title": "Alignment parameters",
-            "type": "object",
-            "description": "",
-            "default": "",
-            "properties": {
-                "aligner": {
-                    "type": "string",
-                    "description": "Aligner program to use",
-                    "default": "minimap2",
-                    "fa_icon": "fas fa-align-justify",
-                    "enum": ["minimap2", "bwa", "bowtie2"]
-                }
-            },
-            "required": ["aligner"]
-        },
         "input_output_options": {
             "title": "Input/output options",
             "type": "object",
@@ -58,6 +42,57 @@
                 }
             }
         },
+        "alignment_parameters": {
+            "title": "Alignment parameters",
+            "type": "object",
+            "description": "",
+            "default": "",
+            "properties": {
+                "aligner": {
+                    "type": "string",
+                    "description": "Aligner program to use.",
+                    "default": "minimap2",
+                    "fa_icon": "fas fa-align-justify",
+                    "enum": ["minimap2", "bwa", "bowtie2"]
+                }
+            },
+            "required": ["aligner"],
+            "fa_icon": "fas fa-align-justify"
+        },
+        "vsearch_parameters": {
+            "title": "Vsearch parameters",
+            "type": "object",
+            "description": "",
+            "default": "",
+            "properties": {
+                "vsearch_minseqlength": {
+                    "type": "integer",
+                    "default": 55,
+                    "fa_icon": "fas fa-minus",
+                    "description": "Vsearch minimum sequence length.",
+                    "help_text": "Discard sequences shorter than vsearch_minseqlength.",
+                    "minimum": 1
+                },
+                "vsearch_maxseqlength": {
+                    "type": "integer",
+                    "default": 57,
+                    "fa_icon": "fas fa-plus",
+                    "description": "Vsearch maximum sequence length.",
+                    "help_text": "Discard sequences longer than vsearch_minseqlength.",
+                    "minimum": 1
+                },
+                "vsearch_id": {
+                    "type": "number",
+                    "default": 0.99,
+                    "fa_icon": "fas fa-equals",
+                    "description": "Vsearch pairwise identity threshold.",
+                    "help_text": "Do not add the target to the cluster if the pairwise identity with the centroid is lower than id. The pairwise identity is defined as the number of (matching columns) / (alignment length - terminal gaps).",
+                    "minimum": 0,
+                    "maximum": 1
+                }
+            },
+            "fa_icon": "fas fa-layer-group"
+        },
         "reference_genome_options": {
             "title": "Reference genome options",
             "type": "object",
@@ -284,11 +319,14 @@
         }
     },
     "allOf": [
+        {
+            "$ref": "#/definitions/input_output_options"
+        },
         {
             "$ref": "#/definitions/alignment_parameters"
         },
         {
-            "$ref": "#/definitions/input_output_options"
+            "$ref": "#/definitions/vsearch_parameters"
         },
         {
             "$ref": "#/definitions/reference_genome_options"
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index d7d0c55e..f60b937d 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -277,7 +277,10 @@ workflow CRISPRSEQ {
         SEQTK_SEQ.out.fastx
     )
 
-    EXTRACT_UMIS.out.fasta.view()
+
+    VSEARCH_CLUSTER (
+        EXTRACT_UMIS.out.fasta
+    )
 
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:

From 8467cd9c1303190bec94579372b491a1796e9cf9 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 20 Mar 2023 10:09:46 +0100
Subject: [PATCH 06/45] don't compress cluster fastas

---
 modules/nf-core/vsearch/cluster/main.nf              | 2 +-
 modules/nf-core/vsearch/cluster/vsearch-cluster.diff | 9 +++++++++
 workflows/crisprseq.nf                               | 2 ++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf
index c450470b..413f70c9 100644
--- a/modules/nf-core/vsearch/cluster/main.nf
+++ b/modules/nf-core/vsearch/cluster/main.nf
@@ -20,7 +20,7 @@ process VSEARCH_CLUSTER {
     tuple val(meta), path('*.blast.tsv.gz')          , optional: true, emit: blast
     tuple val(meta), path('*.uc.tsv.gz')             , optional: true, emit: uc
     tuple val(meta), path('*.centroids.fasta.gz')    , optional: true, emit: centroids
-    tuple val(meta), path('*.clusters.fasta.gz')     , optional: true, emit: clusters
+    tuple val(meta), path('*_clusters*')             , optional: true, emit: clusters
     tuple val(meta), path('*.profile.txt.gz')        , optional: true, emit: profile
     tuple val(meta), path('*.msa.fasta.gz')          , optional: true, emit: msa
     path "versions.yml"                              , emit: versions
diff --git a/modules/nf-core/vsearch/cluster/vsearch-cluster.diff b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff
index 13ad9aa5..ff6808ac 100644
--- a/modules/nf-core/vsearch/cluster/vsearch-cluster.diff
+++ b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff
@@ -1,6 +1,15 @@
 Changes in module 'nf-core/vsearch/cluster'
 --- modules/nf-core/vsearch/cluster/main.nf
 +++ modules/nf-core/vsearch/cluster/main.nf
+@@ -20,7 +20,7 @@
+     tuple val(meta), path('*.blast.tsv.gz')          , optional: true, emit: blast
+     tuple val(meta), path('*.uc.tsv.gz')             , optional: true, emit: uc
+     tuple val(meta), path('*.centroids.fasta.gz')    , optional: true, emit: centroids
+-    tuple val(meta), path('*.clusters.fasta.gz')     , optional: true, emit: clusters
++    tuple val(meta), path('*_clusters*')             , optional: true, emit: clusters
+     tuple val(meta), path('*.profile.txt.gz')        , optional: true, emit: profile
+     tuple val(meta), path('*.msa.fasta.gz')          , optional: true, emit: msa
+     path "versions.yml"                              , emit: versions
 @@ -37,16 +37,16 @@
      if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) {
              error "Unknown clustering option provided (${args2})"
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index f60b937d..f5d7c796 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -282,6 +282,8 @@ workflow CRISPRSEQ {
         EXTRACT_UMIS.out.fasta
     )
 
+    VSEARCH_CLUSTER.out.clusters.transpose().view()
+
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 

From bc43a971837f0e2e9db45b7cb8794315b4948fb1 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Tue, 21 Mar 2023 10:48:32 +0100
Subject: [PATCH 07/45] add closure to obtain sequence from umi

---
 workflows/crisprseq.nf | 69 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 3 deletions(-)

diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index f5d7c796..48370acf 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -48,6 +48,7 @@ include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file'
 include { ORIENT_REFERENCE                 } from '../modules/local/orient_reference'
 include { CIGAR_PARSER                     } from '../modules/local/cigar_parser'
 include { MERGING_SUMMARY                  } from '../modules/local/merging_summary'
+include { UMI_TO_SEQUENCE                  } from '../modules/local/umi_to_sequence'
 include { CLUSTERING_SUMMARY               } from '../modules/local/clustering_summary'
 include { ALIGNMENT_SUMMARY                } from '../modules/local/alignment_summary'
 include { TEMPLATE_REFERENCE               } from '../modules/local/template_reference'
@@ -79,6 +80,32 @@ include { CUTADAPT                                      } from '../modules/nf-co
 include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
 
 
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    DEFINE GROOVY FUNCTIONS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+sufix = ""
+
+umi_to_sequence = { meta, cluster ->
+    cluster.withReader { source ->
+        output = file("${cluster.baseName}_${sufix}.fasta")
+        output.withWriter { target ->
+            String line
+            while ( line=source.readLine() ) {
+                if (line.startsWith(">")) {
+                    sequence = (line =~ /;seq=(.*$)/)[0][1]
+                    id = (line =~ /(>.*?);/)[0][1]
+                }
+                target << id + "\n" + sequence
+            }
+        }
+    }
+    return [meta, output]
+}
+
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -278,11 +305,49 @@ workflow CRISPRSEQ {
     )
 
 
+    //
+    // MODULE: Cluster UMIs
+    //
     VSEARCH_CLUSTER (
         EXTRACT_UMIS.out.fasta
     )
 
-    VSEARCH_CLUSTER.out.clusters.transpose().view()
+    //  Obtain a file with UBS (UBI bin size) and UMI ID
+    VSEARCH_CLUSTER.out.clusters
+    .transpose()
+    .collectFile( storeDir:params.outdir ) {
+        it ->
+            [ "${it[0].id}_ubs.txt", "${it[1].countFasta()}\t${it[1].baseName}\n" ]
+    }
+
+    // Branch the clusters into the ones containing only one sequence and the ones containing more than one sequences
+    VSEARCH_CLUSTER.out.clusters
+    .transpose()
+    .branch{
+        meta, cluster ->
+            single: cluster.countFasta() == 1
+                return [meta, cluster]
+            cluster: cluster.countFasta() > 1
+                return [meta, cluster]
+    }
+    .set{ ch_umi_bysize }
+
+    // Get the correspondent fasta sequencences from single clusters
+    sufix = "consensus"
+
+    ch_umi_bysize.single
+    .map(umi_to_sequence)
+    .set{ ch_cluster_sequence }
+
+
+    ch_cluster_sequence.view()
+
+    //
+    // MODULE: Obtain longest read in cluster
+    //
+    VSEARCH_SORT(
+
+    )
 
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
@@ -290,8 +355,6 @@ workflow CRISPRSEQ {
 
     Modules to implement:
 
-    vsearch
-    get_ubs
     top_read
     polishing: minimap2, racon
     consensus

From 181c3d5b2ec439a191edc981140a4c1f9c037abc Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 24 Mar 2023 11:16:52 +0100
Subject: [PATCH 08/45] try to make umi_to_sequence.py faster

---
 modules/local/umi_to_sequence.nf | 21 ++++++++++++++++
 templates/umi_to_sequence.py     | 11 +++++++++
 workflows/crisprseq.nf           | 41 +++++---------------------------
 3 files changed, 38 insertions(+), 35 deletions(-)
 create mode 100644 modules/local/umi_to_sequence.nf
 create mode 100644 templates/umi_to_sequence.py

diff --git a/modules/local/umi_to_sequence.nf b/modules/local/umi_to_sequence.nf
new file mode 100644
index 00000000..aa3be435
--- /dev/null
+++ b/modules/local/umi_to_sequence.nf
@@ -0,0 +1,21 @@
+process UMI_TO_SEQUENCE {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "conda-forge::p7zip==16.02"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/p7zip:16.02' :
+        'quay.io/biocontainers/p7zip:16.02' }"
+
+    input:
+    tuple val(meta), path(cluster)
+
+    output:
+    tuple val(meta), path("*.fasta"), emit: fasta
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    template 'umi_to_sequence.py'
+}
diff --git a/templates/umi_to_sequence.py b/templates/umi_to_sequence.py
new file mode 100644
index 00000000..09a24bd0
--- /dev/null
+++ b/templates/umi_to_sequence.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+
+#### Extract the read sequence from a FASTA file of UMIs (output of extract_umis.py)
+#### author: Júlia Mir Pedrol @mirpedrol
+#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text.
+
+with open("$cluster") as i:
+    for line in i:
+        split = line.split(";")
+        with open("${cluster.baseName}_${task.ext.prefix}.fasta", "w") as o:
+            o.write(f"{line.split(';')[0]}\n{line.split('=')[-1]}")
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 48370acf..5d985cf1 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -80,32 +80,6 @@ include { CUTADAPT                                      } from '../modules/nf-co
 include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
 
 
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    DEFINE GROOVY FUNCTIONS
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-sufix = ""
-
-umi_to_sequence = { meta, cluster ->
-    cluster.withReader { source ->
-        output = file("${cluster.baseName}_${sufix}.fasta")
-        output.withWriter { target ->
-            String line
-            while ( line=source.readLine() ) {
-                if (line.startsWith(">")) {
-                    sequence = (line =~ /;seq=(.*$)/)[0][1]
-                    id = (line =~ /(>.*?);/)[0][1]
-                }
-                target << id + "\n" + sequence
-            }
-        }
-    }
-    return [meta, output]
-}
-
-
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -333,21 +307,18 @@ workflow CRISPRSEQ {
     .set{ ch_umi_bysize }
 
     // Get the correspondent fasta sequencences from single clusters
-    sufix = "consensus"
-
-    ch_umi_bysize.single
-    .map(umi_to_sequence)
-    .set{ ch_cluster_sequence }
-
+    //
+    UMI_TO_SEQUENCE (
+        ch_umi_bysize.single
+    )
 
-    ch_cluster_sequence.view()
 
     //
     // MODULE: Obtain longest read in cluster
     //
-    VSEARCH_SORT(
+    //VSEARCH_SORT(
 
-    )
+    //)
 
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:

From 2198371bf2bfcc06f493e8c46d675cbc6dea481e Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 24 Mar 2023 15:38:33 +0100
Subject: [PATCH 09/45] use a groovy function for umi_to_sequence

---
 modules/local/umi_to_sequence.nf | 21 --------------------
 templates/umi_to_sequence.py     | 11 -----------
 workflows/crisprseq.nf           | 34 +++++++++++++++++++++++++++-----
 3 files changed, 29 insertions(+), 37 deletions(-)
 delete mode 100644 modules/local/umi_to_sequence.nf
 delete mode 100644 templates/umi_to_sequence.py

diff --git a/modules/local/umi_to_sequence.nf b/modules/local/umi_to_sequence.nf
deleted file mode 100644
index aa3be435..00000000
--- a/modules/local/umi_to_sequence.nf
+++ /dev/null
@@ -1,21 +0,0 @@
-process UMI_TO_SEQUENCE {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "conda-forge::p7zip==16.02"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/p7zip:16.02' :
-        'quay.io/biocontainers/p7zip:16.02' }"
-
-    input:
-    tuple val(meta), path(cluster)
-
-    output:
-    tuple val(meta), path("*.fasta"), emit: fasta
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    template 'umi_to_sequence.py'
-}
diff --git a/templates/umi_to_sequence.py b/templates/umi_to_sequence.py
deleted file mode 100644
index 09a24bd0..00000000
--- a/templates/umi_to_sequence.py
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env python3
-
-#### Extract the read sequence from a FASTA file of UMIs (output of extract_umis.py)
-#### author: Júlia Mir Pedrol @mirpedrol
-#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text.
-
-with open("$cluster") as i:
-    for line in i:
-        split = line.split(";")
-        with open("${cluster.baseName}_${task.ext.prefix}.fasta", "w") as o:
-            o.write(f"{line.split(';')[0]}\n{line.split('=')[-1]}")
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 5d985cf1..fa768c23 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -48,7 +48,6 @@ include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file'
 include { ORIENT_REFERENCE                 } from '../modules/local/orient_reference'
 include { CIGAR_PARSER                     } from '../modules/local/cigar_parser'
 include { MERGING_SUMMARY                  } from '../modules/local/merging_summary'
-include { UMI_TO_SEQUENCE                  } from '../modules/local/umi_to_sequence'
 include { CLUSTERING_SUMMARY               } from '../modules/local/clustering_summary'
 include { ALIGNMENT_SUMMARY                } from '../modules/local/alignment_summary'
 include { TEMPLATE_REFERENCE               } from '../modules/local/template_reference'
@@ -80,6 +79,26 @@ include { CUTADAPT                                      } from '../modules/nf-co
 include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
 
 
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    DEFINE GROOVY FUNCTIONS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+def umi_to_sequence(cluster) {
+    cluster.withReader { source ->
+        String line
+        while ( line=source.readLine() ) {
+            if (line.startsWith(">")) {
+                sequence = (line =~ /;seq=(.*$)/)[0][1]
+                id = (line =~ /(>.*?);/)[0][1]
+            }
+        }
+    }
+    return id + "\n" + sequence
+}
+
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -307,10 +326,15 @@ workflow CRISPRSEQ {
     .set{ ch_umi_bysize }
 
     // Get the correspondent fasta sequencences from single clusters
-    //
-    UMI_TO_SEQUENCE (
-        ch_umi_bysize.single
-    )
+    ch_umi_bysize.single
+    .map{ meta, cluster ->
+        fasta_line = umi_to_sequence(cluster)
+        [meta, cluster.baseName, fasta_line]
+    }
+    .collectFile() { meta, name, fasta ->
+        [ "{$name}_consensus.fasta", fasta ]
+    }
+    .set{ ch_single_clusters_consensus }
 
 
     //

From 828dbf5729dcab6bca09b554216af80cafab14ed Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Wed, 5 Apr 2023 15:03:27 +0200
Subject: [PATCH 10/45] install vsearch/sort

---
 modules.json                          |  5 +++
 modules/nf-core/vsearch/sort/main.nf  | 47 +++++++++++++++++++++++++
 modules/nf-core/vsearch/sort/meta.yml | 49 +++++++++++++++++++++++++++
 workflows/crisprseq.nf                |  1 +
 4 files changed, 102 insertions(+)
 create mode 100644 modules/nf-core/vsearch/sort/main.nf
 create mode 100644 modules/nf-core/vsearch/sort/meta.yml

diff --git a/modules.json b/modules.json
index c741cfd1..afa23421 100644
--- a/modules.json
+++ b/modules.json
@@ -77,6 +77,11 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/vsearch/cluster/vsearch-cluster.diff"
+                    },
+                    "vsearch/sort": {
+                        "branch": "master",
+                        "git_sha": "2729a013f2c8930ea39d747e2fa589ca89d201ee",
+                        "installed_by": ["modules"]
                     }
                 }
             },
diff --git a/modules/nf-core/vsearch/sort/main.nf b/modules/nf-core/vsearch/sort/main.nf
new file mode 100644
index 00000000..5f410025
--- /dev/null
+++ b/modules/nf-core/vsearch/sort/main.nf
@@ -0,0 +1,47 @@
+process VSEARCH_SORT {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::vsearch=2.21.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/vsearch:2.21.1--h95f258a_0':
+        'quay.io/biocontainers/vsearch:2.21.1--h95f258a_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    val sort_arg
+
+    output:
+    tuple val(meta), path("*_sorted.fasta"), emit: fasta
+    path "versions.yml"                    , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if ("$fasta" == "${prefix}.fasta") error "Input and output names are the same, set prefix in module configuration to disambiguate!"
+    """
+    vsearch \\
+        $sort_arg $fasta \\
+        --threads $task.cpus \\
+        --output ${prefix}.fasta \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g;s/,.*//g;s/^v//;s/_.*//')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${prefix}.fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g;s/,.*//g;s/^v//;s/_.*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/vsearch/sort/meta.yml b/modules/nf-core/vsearch/sort/meta.yml
new file mode 100644
index 00000000..a527b7f1
--- /dev/null
+++ b/modules/nf-core/vsearch/sort/meta.yml
@@ -0,0 +1,49 @@
+name: "vsearch_sort"
+description: Sort fasta entries by decreasing abundance (--sortbysize) or sequence length (--sortbylength).
+keywords:
+  - vsearch/sort
+  - vsearch
+  - sort
+  - amplicon sequences
+  - metagenomics
+  - genomics
+  - population genetics
+tools:
+  - vsearch:
+      description: VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative)
+      homepage: https://github.com/torognes/vsearch
+      documentation: https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf
+      tool_dev_url: https://github.com/torognes/vsearch
+      doi: 10.7717/peerj.2584
+      licence: ["GPL v3-or-later OR BSD-2-clause"]
+
+input:
+  - meta:
+      type: map
+      description: Groovy Map containing sample information e.g. [ id:'test' ]
+  - fasta:
+      type: file
+      description: Sequences to be sorted in FASTA format
+      pattern: "*.{fasta,fa,fasta.gz,fa.gz}"
+  - sort_arg:
+      type: string
+      description: Argument to provide to sort algorithm. Sort by abundance with --sortbysize or by sequence length with --sortbylength.
+      enums: ["--sortbysize", "--sortbylength"]
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Sorted FASTA file
+      pattern: "*.{fasta}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@mirpedrol"
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index fa768c23..2cd4d084 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -69,6 +69,7 @@ include { PEAR                                          } from '../modules/nf-co
 include { CAT_FASTQ                                     } from '../modules/nf-core/cat/fastq/main'
 include { SEQTK_SEQ                                     } from '../modules/nf-core/seqtk/seq/main'
 include { VSEARCH_CLUSTER                               } from '../modules/nf-core/vsearch/cluster/main'
+include { VSEARCH_SORT                                  } from '../modules/nf-core/vsearch/sort/main'
 include { BOWTIE2_ALIGN                                 } from '../modules/nf-core/bowtie2/align/main'
 include { BOWTIE2_BUILD                                 } from '../modules/nf-core/bowtie2/build/main'
 include { BWA_MEM                                       } from '../modules/nf-core/bwa/mem/main'

From 731ffb3437e56fe0155408f7234576938504bccc Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Wed, 5 Apr 2023 15:13:21 +0200
Subject: [PATCH 11/45] add umi_bin_size parameter

---
 nextflow.config      |  3 +++
 nextflow_schema.json | 22 ++++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 1aaf3c0e..be5bd498 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,6 +13,9 @@ params {
     input                      = null
     aligner                    = 'minimap2'
 
+    // UMI parameters
+    umi_bin_size               = 1
+
     // Vsearch options
     vsearch_minseqlength       = 55
     vsearch_maxseqlength       = 57
diff --git a/nextflow_schema.json b/nextflow_schema.json
index efad63df..74b49103 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -42,10 +42,25 @@
                 }
             }
         },
+        "umi_parameters": {
+            "title": "UMI parameters",
+            "type": "object",
+            "description": "Parameters regarding umi molecular identifiers (UMIs)",
+            "default": "",
+            "properties": {
+                "umi_bin_size": {
+                    "type": "integer",
+                    "default": 1,
+                    "description": "Minimum size of a UMI cluster.",
+                    "fa_icon": "fas fa-sort-amount-down"
+                }
+            },
+            "fa_icon": "fas fa-layer-group"
+        },
         "alignment_parameters": {
             "title": "Alignment parameters",
             "type": "object",
-            "description": "",
+            "description": "Parameters used for alignment processes",
             "default": "",
             "properties": {
                 "aligner": {
@@ -62,7 +77,7 @@
         "vsearch_parameters": {
             "title": "Vsearch parameters",
             "type": "object",
-            "description": "",
+            "description": "Parameters to use in Vsearch processes",
             "default": "",
             "properties": {
                 "vsearch_minseqlength": {
@@ -322,6 +337,9 @@
         {
             "$ref": "#/definitions/input_output_options"
         },
+        {
+            "$ref": "#/definitions/umi_parameters"
+        },
         {
             "$ref": "#/definitions/alignment_parameters"
         },

From 46535d8574a9deb8f3854bb8cd19612cee132e4a Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Thu, 6 Apr 2023 09:57:15 +0200
Subject: [PATCH 12/45] update vsearch/sort and add sequence obtention from umi

---
 conf/modules.config                  |  6 +++
 modules.json                         |  2 +-
 modules/nf-core/vsearch/sort/main.nf |  4 +-
 workflows/crisprseq.nf               | 58 +++++++++++++++++++++++++---
 4 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index f4a637a7..e07f587b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -111,6 +111,12 @@ process {
         ext.args3 = '--clusters'
     }
 
+
+    withName: VSEARCH_SORT {
+        ext.args = '--topn 1'
+        ext.prefix = { "${fasta.baseName}_top" }
+    }
+
     withName: MERGING_SUMMARY {
         publishDir = [
             path: { "${params.outdir}/summary/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
diff --git a/modules.json b/modules.json
index afa23421..5fe66a88 100644
--- a/modules.json
+++ b/modules.json
@@ -80,7 +80,7 @@
                     },
                     "vsearch/sort": {
                         "branch": "master",
-                        "git_sha": "2729a013f2c8930ea39d747e2fa589ca89d201ee",
+                        "git_sha": "e7801603532df26b4bb4ef324ca2c39f7a4d0eee",
                         "installed_by": ["modules"]
                     }
                 }
diff --git a/modules/nf-core/vsearch/sort/main.nf b/modules/nf-core/vsearch/sort/main.nf
index 5f410025..a6f06535 100644
--- a/modules/nf-core/vsearch/sort/main.nf
+++ b/modules/nf-core/vsearch/sort/main.nf
@@ -12,8 +12,8 @@ process VSEARCH_SORT {
     val sort_arg
 
     output:
-    tuple val(meta), path("*_sorted.fasta"), emit: fasta
-    path "versions.yml"                    , emit: versions
+    tuple val(meta), path("*.fasta"), emit: fasta
+    path "versions.yml"             , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 2cd4d084..3271407a 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -75,6 +75,7 @@ include { BOWTIE2_BUILD                                 } from '../modules/nf-co
 include { BWA_MEM                                       } from '../modules/nf-core/bwa/mem/main'
 include { BWA_INDEX                                     } from '../modules/nf-core/bwa/index/main'
 include { MINIMAP2_ALIGN                                } from '../modules/nf-core/minimap2/align/main'
+include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI          } from '../modules/nf-core/minimap2/align/main'
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE     } from '../modules/nf-core/minimap2/align/main'
 include { CUTADAPT                                      } from '../modules/nf-core/cutadapt/main'
 include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
@@ -99,6 +100,20 @@ def umi_to_sequence(cluster) {
     return id + "\n" + sequence
 }
 
+def umi_to_sequence_centroid(cluster) {
+    cluster.withReader { source ->
+        String line
+        while ( line=source.readLine() ) {
+            if (line.startsWith(">")) {
+                sequence = (line =~ /;seq=(.*$)/)[0][1]
+                id = (line =~ /(>.*?);/)[0][1]
+            }
+        }
+    }
+    id = id.replace(">", ">centroid_")
+    return id + "\n" + sequence
+}
+
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -319,9 +334,9 @@ workflow CRISPRSEQ {
     .transpose()
     .branch{
         meta, cluster ->
-            single: cluster.countFasta() == 1
+            single: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() == 1
                 return [meta, cluster]
-            cluster: cluster.countFasta() > 1
+            cluster: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() > 1
                 return [meta, cluster]
     }
     .set{ ch_umi_bysize }
@@ -339,11 +354,43 @@ workflow CRISPRSEQ {
 
 
     //
-    // MODULE: Obtain longest read in cluster
+    // MODULE: Obtain most abundant UMI in cluster
+    //
+    VSEARCH_SORT(
+        ch_umi_bysize.cluster,
+        Channel.value("--sortbysize")
+    )
+
+    // Get the correspondent fasta sequencences from top cluster sequences
+    // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps
+    VSEARCH_SORT.out.fasta
+    .map{ meta, fasta ->
+        fasta_line = umi_to_sequence_centroid(fasta)
+        [meta, fasta.baseName, fasta_line]
+    }
+    .collectFile() { meta, name, fasta ->
+        [ "{$name}.fasta", fasta ]
+    }
+    .set{ ch_top_clusters_sequence }
+
+    // Get the correspondent fasta sequencences from UMI clusters
+    ch_umi_bysize.cluster
+    .map{ meta, cluster ->
+        fasta_line = umi_to_sequence(cluster)
+        [meta, cluster.baseName, fasta_line]
+    }
+    .collectFile() { meta, name, fasta ->
+        [ "{$name}_sequences.fasta", fasta ]
+    }
+    .set{ ch_clusters_sequence }
+
+    //
+    // MODULE: Mapping with minimap2
     //
-    //VSEARCH_SORT(
+    // Map each cluster against the top read (most abundant UMI) in the cluster
+    MINIMAP2_ALIGN_UMI (
 
-    //)
+    )
 
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
@@ -351,7 +398,6 @@ workflow CRISPRSEQ {
 
     Modules to implement:
 
-    top_read
     polishing: minimap2, racon
     consensus
     join_reads

From 75584e27a306eb4735f6e73a1121b47cfec0c050 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Thu, 6 Apr 2023 10:11:14 +0200
Subject: [PATCH 13/45] add first minimap step for UMI consensus

---
 conf/modules.config    |  5 +++++
 workflows/crisprseq.nf | 12 +++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index e07f587b..a4922cb5 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -125,6 +125,11 @@ process {
         ]
     }
 
+    withName: MINIMAP2_ALIGN_UMI {
+        ext.args = '-x map-ont'
+        ext.prefix = { "${reads.baseName}_cycle1" }
+    }
+
     withName: DUMMY_FINAL_UMI {
         publishDir = [
             path: { "${params.outdir}/preprocessing/UMI" },
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 3271407a..cf4072fa 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -384,14 +384,24 @@ workflow CRISPRSEQ {
     }
     .set{ ch_clusters_sequence }
 
+    ch_clusters_sequence
+    .join(ch_top_clusters_sequence)
+    .set{ ch_cluster_reference }
+
     //
     // MODULE: Mapping with minimap2
     //
     // Map each cluster against the top read (most abundant UMI) in the cluster
     MINIMAP2_ALIGN_UMI (
-
+        ch_clusters_sequence
+            .join(ch_top_clusters_sequence),
+        false, //output in paf format
+        false,
+        false
     )
 
+
+
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 

From 419962829886c593251a9c5389d3d69502e724ce Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Thu, 6 Apr 2023 10:12:25 +0200
Subject: [PATCH 14/45] install new module racon

---
 modules.json                   |  5 ++++
 modules/nf-core/racon/main.nf  | 38 +++++++++++++++++++++++++
 modules/nf-core/racon/meta.yml | 52 ++++++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+)
 create mode 100644 modules/nf-core/racon/main.nf
 create mode 100644 modules/nf-core/racon/meta.yml

diff --git a/modules.json b/modules.json
index 5fe66a88..78a82edf 100644
--- a/modules.json
+++ b/modules.json
@@ -62,6 +62,11 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
+                    "racon": {
+                        "branch": "master",
+                        "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/index": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf
new file mode 100644
index 00000000..f8ed3691
--- /dev/null
+++ b/modules/nf-core/racon/main.nf
@@ -0,0 +1,38 @@
+process RACON {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::racon=1.4.20"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/racon:1.4.20--h9a82719_1' :
+        'quay.io/biocontainers/racon:1.4.20--h9a82719_1' }"
+
+    input:
+    tuple val(meta), path(reads), path(assembly), path(paf)
+
+    output:
+    tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly
+    path "versions.yml"          , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    racon -t "$task.cpus" \\
+        "${reads}" \\
+        "${paf}" \\
+        $args \\
+        "${assembly}" > \\
+        ${prefix}_assembly_consensus.fasta
+
+    gzip -n ${prefix}_assembly_consensus.fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        racon: \$( racon --version 2>&1 | sed 's/^.*v//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/racon/meta.yml b/modules/nf-core/racon/meta.yml
new file mode 100644
index 00000000..2e7737d9
--- /dev/null
+++ b/modules/nf-core/racon/meta.yml
@@ -0,0 +1,52 @@
+name: racon
+description: Consensus module for raw de novo DNA assembly of long uncorrected reads
+keywords:
+  - assembly
+  - pacbio
+  - nanopore
+  - polish
+tools:
+  - racon:
+      description: Ultrafast consensus module for raw de novo genome assembly of long uncorrected reads.
+      homepage: https://github.com/lbcb-sci/racon
+      documentation: https://github.com/lbcb-sci/racon
+      tool_dev_url: https://github.com/lbcb-sci/racon
+      doi: 10.1101/gr.214270.116
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: List of input FastQ files. Racon expects single end reads
+      pattern: "*.{fastq,fastq.gz,fq,fq.gz}"
+  - assembly:
+      type: file
+      description: Genome assembly to be improved
+      pattern: "*.{fasta,fa}"
+  - paf:
+      type: file
+      description: Alignment in PAF format
+      pattern: "*.paf"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - improved_assembly:
+      type: file
+      description: Improved genome assembly
+      pattern: "*_assembly_consensus.fasta.gz"
+
+authors:
+  - "@avantonder"

From e9adce876216ba6dde4f99ca12983b8815a5ee54 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Thu, 6 Apr 2023 13:57:58 +0200
Subject: [PATCH 15/45] add racon

---
 conf/modules.config    | 17 +++++++-
 workflows/crisprseq.nf | 88 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 91 insertions(+), 14 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index a4922cb5..94c79519 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -125,11 +125,26 @@ process {
         ]
     }
 
-    withName: MINIMAP2_ALIGN_UMI {
+    withName: MINIMAP2_ALIGN_UMI_1 {
         ext.args = '-x map-ont'
         ext.prefix = { "${reads.baseName}_cycle1" }
     }
 
+    withName: MINIMAP2_ALIGN_UMI_2 {
+        ext.args = '-x map-ont'
+        ext.prefix = { "${reads.baseName}_cycle2" }
+    }
+
+    withName: RACON_1 {
+        ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming'
+        ext.prefix = { "${reads.baseName}_cycle1" }
+    }
+
+    withName: RACON_2 {
+        ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming'
+        ext.prefix = { "${reads.baseName}_cycle2" }
+    }
+
     withName: DUMMY_FINAL_UMI {
         publishDir = [
             path: { "${params.outdir}/preprocessing/UMI" },
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index cf4072fa..f2f352a7 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -70,12 +70,15 @@ include { CAT_FASTQ                                     } from '../modules/nf-co
 include { SEQTK_SEQ                                     } from '../modules/nf-core/seqtk/seq/main'
 include { VSEARCH_CLUSTER                               } from '../modules/nf-core/vsearch/cluster/main'
 include { VSEARCH_SORT                                  } from '../modules/nf-core/vsearch/sort/main'
+include { RACON as RACON_1                              } from '../modules/nf-core/racon/main'
+include { RACON as RACON_2                              } from '../modules/nf-core/racon/main'
 include { BOWTIE2_ALIGN                                 } from '../modules/nf-core/bowtie2/align/main'
 include { BOWTIE2_BUILD                                 } from '../modules/nf-core/bowtie2/build/main'
 include { BWA_MEM                                       } from '../modules/nf-core/bwa/mem/main'
 include { BWA_INDEX                                     } from '../modules/nf-core/bwa/index/main'
 include { MINIMAP2_ALIGN                                } from '../modules/nf-core/minimap2/align/main'
-include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI          } from '../modules/nf-core/minimap2/align/main'
+include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1        } from '../modules/nf-core/minimap2/align/main'
+include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2        } from '../modules/nf-core/minimap2/align/main'
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE     } from '../modules/nf-core/minimap2/align/main'
 include { CUTADAPT                                      } from '../modules/nf-core/cutadapt/main'
 include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
@@ -343,16 +346,26 @@ workflow CRISPRSEQ {
 
     // Get the correspondent fasta sequencences from single clusters
     ch_umi_bysize.single
+    .tap{ meta_channel }
     .map{ meta, cluster ->
         fasta_line = umi_to_sequence(cluster)
         [meta, cluster.baseName, fasta_line]
     }
     .collectFile() { meta, name, fasta ->
-        [ "{$name}_consensus.fasta", fasta ]
+        [ "${name}_consensus.fasta", fasta ]
+    }
+    .map{ new_file ->
+        [new_file.baseName, new_file]
+    }
+    .join(meta_channel
+        .map { meta, original_file ->
+            ["${original_file.baseName}_consensus", meta]
+        })
+    .map{ file_name, new_file, meta ->
+        [meta, new_file]
     }
     .set{ ch_single_clusters_consensus }
 
-
     //
     // MODULE: Obtain most abundant UMI in cluster
     //
@@ -364,35 +377,56 @@ workflow CRISPRSEQ {
     // Get the correspondent fasta sequencences from top cluster sequences
     // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps
     VSEARCH_SORT.out.fasta
+    .tap{ meta_channel_2 }
     .map{ meta, fasta ->
         fasta_line = umi_to_sequence_centroid(fasta)
         [meta, fasta.baseName, fasta_line]
     }
     .collectFile() { meta, name, fasta ->
-        [ "{$name}.fasta", fasta ]
+        [ "${name}.fasta", fasta ]
+    }
+    .map{ new_file ->
+        [new_file.baseName[0..-5], new_file] // Substring is removing "_top" added by VSEARCH_SORT
+    }
+    .join(meta_channel_2
+        .map { meta, original_file ->
+            ["${original_file.baseName}"[0..-5], meta] // Substring is removing "_top" added by VSEARCH_SORT
+        })
+    .map{ file_name, new_file, meta ->
+        [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map
     }
     .set{ ch_top_clusters_sequence }
 
     // Get the correspondent fasta sequencences from UMI clusters
     ch_umi_bysize.cluster
+    .tap{ meta_channel_3 }
     .map{ meta, cluster ->
         fasta_line = umi_to_sequence(cluster)
         [meta, cluster.baseName, fasta_line]
     }
     .collectFile() { meta, name, fasta ->
-        [ "{$name}_sequences.fasta", fasta ]
+        [ "${name}_sequences.fasta", fasta ]
+    }
+    .map{ new_file ->
+        [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile
+    }
+    .join(meta_channel_3
+        .map { meta, original_file ->
+            ["${original_file.baseName}", meta]
+        })
+    .map{ file_name, new_file, meta ->
+        [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map
     }
     .set{ ch_clusters_sequence }
 
-    ch_clusters_sequence
-    .join(ch_top_clusters_sequence)
-    .set{ ch_cluster_reference }
+    // Cluster consensus & polishing
+    // Two cycles of minimap2 + racon
 
     //
-    // MODULE: Mapping with minimap2
+    // MODULE: Mapping with minimap2 - cycle 1
     //
     // Map each cluster against the top read (most abundant UMI) in the cluster
-    MINIMAP2_ALIGN_UMI (
+    MINIMAP2_ALIGN_UMI_1 (
         ch_clusters_sequence
             .join(ch_top_clusters_sequence),
         false, //output in paf format
@@ -400,7 +434,35 @@ workflow CRISPRSEQ {
         false
     )
 
+    //
+    // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2
+    //
+    RACON_1 (
+        ch_clusters_sequence
+            .join(ch_top_clusters_sequence)
+            .join(MINIMAP2_ALIGN_UMI_1.out.paf)
+    )
 
+    //
+    // MODULE: Mapping with minimap2 - cycle 2
+    //
+    // Map each cluster against the top read (most abundant UMI) in the cluster
+    MINIMAP2_ALIGN_UMI_2 (
+        ch_clusters_sequence
+            .join(RACON_1.out.improved_assembly),
+        false, //output in paf format
+        false,
+        false
+    )
+
+    //
+    // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2
+    //
+    RACON_2 (
+        ch_clusters_sequence
+            .join(RACON_1.out.improved_assembly)
+            .join(MINIMAP2_ALIGN_UMI_2.out.paf)
+    )
 
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
@@ -561,9 +623,9 @@ workflow CRISPRSEQ {
     //
     // MODULE: Parse cigar to find edits
     //
-    CIGAR_PARSER (
-        ch_to_parse_cigar
-    )
+    //CIGAR_PARSER (
+    //    ch_to_parse_cigar
+    //)
 
 
     //

From a85fc451e22a22a234ef32197a6585fa74d4b6c0 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 5 May 2023 09:03:50 +0200
Subject: [PATCH 16/45] fix bug returning only one sequence from UMI cluster
 and filter UMI alignments to ignore empty alignments

---
 workflows/crisprseq.nf | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index f2f352a7..5c7d1355 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -93,14 +93,16 @@ include { SAMTOOLS_INDEX                                } from '../modules/nf-co
 def umi_to_sequence(cluster) {
     cluster.withReader { source ->
         String line
+        sequences = ""
         while ( line=source.readLine() ) {
             if (line.startsWith(">")) {
                 sequence = (line =~ /;seq=(.*$)/)[0][1]
                 id = (line =~ /(>.*?);/)[0][1]
+                sequences = sequences + id + "\n" + sequence + "\n"
             }
         }
     }
-    return id + "\n" + sequence
+    return sequences
 }
 
 def umi_to_sequence_centroid(cluster) {
@@ -434,13 +436,18 @@ workflow CRISPRSEQ {
         false
     )
 
+    // Only continue with clusters that have aligned sequences
+    MINIMAP2_ALIGN_UMI_1.out.paf
+        .filter{ it[1].countLines() > 0 }
+        .set{ ch_minimap_1 }
+
     //
-    // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2
+    // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1
     //
     RACON_1 (
         ch_clusters_sequence
             .join(ch_top_clusters_sequence)
-            .join(MINIMAP2_ALIGN_UMI_1.out.paf)
+            .join(ch_minimap_1)
     )
 
     //
@@ -455,13 +462,18 @@ workflow CRISPRSEQ {
         false
     )
 
+    // Only continue with clusters that have aligned sequences
+    MINIMAP2_ALIGN_UMI_2.out.paf
+        .filter{ it[1].countLines() > 0 }
+        .set{ ch_minimap_2 }
+
     //
     // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2
     //
     RACON_2 (
         ch_clusters_sequence
             .join(RACON_1.out.improved_assembly)
-            .join(MINIMAP2_ALIGN_UMI_2.out.paf)
+            .join(ch_minimap_2)
     )
 
     /*

From 424ee03534b1a9070fded4e2154b3f0142e73dae Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 5 May 2023 09:04:57 +0200
Subject: [PATCH 17/45] update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 196d4589..9b303fc1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add UMI clustering to crisprseq-edition
+
 ### Fixed
 
 ### Dependencies

From 07e21ffa215afa5dbd20a84dc429007403b85e1c Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 5 May 2023 10:15:53 +0200
Subject: [PATCH 18/45] add test_umi tests to CI

---
 .github/workflows/ci.yml | 23 +++++++++++++++++++++++
 CHANGELOG.md             |  2 +-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c6c9113a..e858ddf9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,3 +41,26 @@ jobs:
         # Remember that you can parallelise this by using strategy.matrix
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+
+  umis:
+    name: Run pipeline with UMI clustering test data
+    # Only run on push if this is the nf-core dev branch (merged PRs)
+    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/crisprseq') }}"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        NXF_VER:
+          - "22.10.1"
+          - "latest-everything"
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@v3
+
+      - name: Install Nextflow
+        uses: nf-core/setup-nextflow@v1
+        with:
+          version: "${{ matrix.NXF_VER }}"
+
+      - name: Run pipeline with UMI clustering test data
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_umis,docker --outdir ./results
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9b303fc1..ecad079f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Add UMI clustering to crisprseq-edition
+- Add UMI clustering to crisprseq-edition ([#24](https://github.com/nf-core/crisprseq/pull/24))
 
 ### Fixed
 

From 4ff306e7f08ca03721bcbd904d4ffa5b58c31130 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 5 May 2023 11:10:33 +0200
Subject: [PATCH 19/45] install medaka

---
 modules.json                    |  5 ++++
 modules/nf-core/medaka/main.nf  | 40 ++++++++++++++++++++++++++++
 modules/nf-core/medaka/meta.yml | 47 +++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+)
 create mode 100644 modules/nf-core/medaka/main.nf
 create mode 100644 modules/nf-core/medaka/meta.yml

diff --git a/modules.json b/modules.json
index 78a82edf..7e1984c3 100644
--- a/modules.json
+++ b/modules.json
@@ -46,6 +46,11 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
+                    "medaka": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
+                    },
                     "minimap2/align": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf
new file mode 100644
index 00000000..d60915ee
--- /dev/null
+++ b/modules/nf-core/medaka/main.nf
@@ -0,0 +1,40 @@
+process MEDAKA {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "bioconda::medaka=1.4.4"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/medaka:1.4.4--py38h130def0_0' :
+        'biocontainers/medaka:1.4.4--py38h130def0_0' }"
+
+    input:
+    tuple val(meta), path(reads), path(assembly)
+
+    output:
+    tuple val(meta), path("*.fa.gz"), emit: assembly
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    medaka_consensus \\
+        -t $task.cpus \\
+        $args \\
+        -i $reads \\
+        -d $assembly \\
+        -o ./
+
+    mv consensus.fasta ${prefix}.fa
+
+    gzip -n ${prefix}.fa
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/medaka/meta.yml b/modules/nf-core/medaka/meta.yml
new file mode 100644
index 00000000..ed124d61
--- /dev/null
+++ b/modules/nf-core/medaka/meta.yml
@@ -0,0 +1,47 @@
+name: medaka
+description: A tool to create consensus sequences and variant calls from nanopore sequencing data
+keywords:
+  - assembly
+  - polishing
+  - nanopore
+tools:
+  - medaka:
+      description: Neural network sequence error correction.
+      homepage: https://nanoporetech.github.io/medaka/index.html
+      documentation: https://nanoporetech.github.io/medaka/index.html
+      tool_dev_url: https://github.com/nanoporetech/medaka
+
+      licence: ["Mozilla Public License 2.0"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: List of input nanopore fasta/FastQ files
+      pattern: "*.{fasta,fa,fastq,fastq.gz,fq,fq.gz}"
+  - assembly:
+      type: file
+      description: Genome assembly
+      pattern: "*.{fasta,fa}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - assembly:
+      type: file
+      description: Polished genome assembly
+      pattern: "*.fa.gz"
+
+authors:
+  - "@avantonder"

From ea19682feca0e71714f4c744b32a842f6920f155 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Fri, 5 May 2023 11:19:57 +0200
Subject: [PATCH 20/45] install samtools/faidx

---
 modules.json                            |  5 +++
 modules/nf-core/samtools/faidx/main.nf  | 44 +++++++++++++++++++++++
 modules/nf-core/samtools/faidx/meta.yml | 47 +++++++++++++++++++++++++
 workflows/crisprseq.nf                  |  4 ++-
 4 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 modules/nf-core/samtools/faidx/main.nf
 create mode 100644 modules/nf-core/samtools/faidx/meta.yml

diff --git a/modules.json b/modules.json
index 7e1984c3..34cbfba7 100644
--- a/modules.json
+++ b/modules.json
@@ -72,6 +72,11 @@
                         "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
                         "installed_by": ["modules"]
                     },
+                    "samtools/faidx": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/index": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf
new file mode 100644
index 00000000..4dd0e5b0
--- /dev/null
+++ b/modules/nf-core/samtools/faidx/main.nf
@@ -0,0 +1,44 @@
+process SAMTOOLS_FAIDX {
+    tag "$fasta"
+    label 'process_single'
+
+    conda "bioconda::samtools=1.17"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path ("*.fai"), emit: fai
+    tuple val(meta), path ("*.gzi"), emit: gzi, optional: true
+    path "versions.yml"            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    samtools \\
+        faidx \\
+        $args \\
+        $fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${fasta}.fai
+    cat <<-END_VERSIONS > versions.yml
+
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml
new file mode 100644
index 00000000..fe2fe9a1
--- /dev/null
+++ b/modules/nf-core/samtools/faidx/meta.yml
@@ -0,0 +1,47 @@
+name: samtools_faidx
+description: Index FASTA file
+keywords:
+  - index
+  - fasta
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: FASTA file
+      pattern: "*.{fa,fasta}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fai:
+      type: file
+      description: FASTA index file
+      pattern: "*.{fai}"
+  - gzi:
+      type: file
+      description: Optional gzip index file for compressed inputs
+      pattern: "*.gzi"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@ewels"
+  - "@phue"
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 5c7d1355..267ee346 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -476,13 +476,15 @@ workflow CRISPRSEQ {
             .join(ch_minimap_2)
     )
 
+
+
+
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 
 
     Modules to implement:
 
-    polishing: minimap2, racon
     consensus
     join_reads
     fa2fq

From 9bf010576b4d80c030ce39ee04d2cf9f5e474918 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 8 May 2023 16:19:44 +0200
Subject: [PATCH 21/45] run faidx

---
 workflows/crisprseq.nf | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 267ee346..98404560 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -80,6 +80,7 @@ include { MINIMAP2_ALIGN                                } from '../modules/nf-co
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1        } from '../modules/nf-core/minimap2/align/main'
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2        } from '../modules/nf-core/minimap2/align/main'
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE     } from '../modules/nf-core/minimap2/align/main'
+include { SAMTOOLS_FAIDX                                } from '../modules/nf-core/samtools/faidx/main'
 include { CUTADAPT                                      } from '../modules/nf-core/cutadapt/main'
 include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
 
@@ -477,7 +478,12 @@ workflow CRISPRSEQ {
     )
 
 
-
+    //
+    // MODULE: Convert reference .fa to .fai
+    //
+    SAMTOOLS_FAIDX (
+        RACON_2.out.improved_assembly
+    )
 
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:

From ebf6fd5a202fad494357ecc7ae436c3890ca0791 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <qeami01@rddsws.am10.uni-tuebingen.de>
Date: Tue, 9 May 2023 10:36:50 +0200
Subject: [PATCH 22/45] add minimap and samtools for mini_align process

---
 conf/modules.config                     |  8 +++
 modules.json                            |  5 ++
 modules/local/modified_minimap2.nf      | 47 ++++++++++++++++++
 modules/nf-core/minimap2/index/main.nf  | 34 +++++++++++++
 modules/nf-core/minimap2/index/meta.yml | 40 +++++++++++++++
 workflows/crisprseq.nf                  | 66 +++++++++++++++++++------
 6 files changed, 185 insertions(+), 15 deletions(-)
 create mode 100644 modules/local/modified_minimap2.nf
 create mode 100644 modules/nf-core/minimap2/index/main.nf
 create mode 100644 modules/nf-core/minimap2/index/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 94c79519..9a9c9978 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -145,6 +145,14 @@ process {
         ext.prefix = { "${reads.baseName}_cycle2" }
     }
 
+    WithName: MINIMAP2_INDEX {
+        ext.args = '-I 16G -x map-ont'
+    }
+
+    withName: MINIMAP2_ALIGN_CONSENSUS_1 {
+        ext.args = '-x map-ont --secondary=no --MD'
+    }
+
     withName: DUMMY_FINAL_UMI {
         publishDir = [
             path: { "${params.outdir}/preprocessing/UMI" },
diff --git a/modules.json b/modules.json
index 34cbfba7..456ae16b 100644
--- a/modules.json
+++ b/modules.json
@@ -57,6 +57,11 @@
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/minimap2/align/minimap2-align.diff"
                     },
+                    "minimap2/index": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
+                    },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba",
diff --git a/modules/local/modified_minimap2.nf b/modules/local/modified_minimap2.nf
new file mode 100644
index 00000000..12e12374
--- /dev/null
+++ b/modules/local/modified_minimap2.nf
@@ -0,0 +1,47 @@
+process MODIFIED_MINIMAP2 {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // Note: the versions here need to match the versions used in the mulled container below and minimap2/index
+    conda "bioconda::minimap2=2.24 bioconda::samtools=1.14"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' :
+        'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }"
+
+    input:
+    tuple val(meta), path(reads), path(reference)
+    val bam_format
+    val cigar_paf_format
+    val cigar_bam
+
+    output:
+    tuple val(meta), path("*.paf"), optional: true, emit: paf
+    tuple val(meta), path("*.bam"), optional: true, emit: bam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def bam_output = bam_format ? "-a | samtools view -@ ${task.cpus} -T ${reference} -F 2308 -bS - | samtools sort -@ ${task.cpus} -o ${prefix}.bam" : "-o ${prefix}.paf"
+    def cigar_paf = cigar_paf_format && !bam_format ? "-c" : ''
+    def set_cigar_bam = cigar_bam && bam_format ? "-L" : ''
+    """
+    minimap2 \\
+        $args \\
+        -t $task.cpus \\
+        "${reference ?: reads}" \\
+        "$reads" \\
+        $cigar_paf \\
+        $set_cigar_bam \\
+        $bam_output
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/minimap2/index/main.nf b/modules/nf-core/minimap2/index/main.nf
new file mode 100644
index 00000000..7a1bb227
--- /dev/null
+++ b/modules/nf-core/minimap2/index/main.nf
@@ -0,0 +1,34 @@
+process MINIMAP2_INDEX {
+    label 'process_medium'
+
+    // Note: the versions here need to match the versions used in minimap2/align
+    conda "bioconda::minimap2=2.24"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/minimap2:2.24--h7132678_1' :
+        'biocontainers/minimap2:2.24--h7132678_1' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path("*.mmi"), emit: index
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    minimap2 \\
+        -t $task.cpus \\
+        -d ${fasta.baseName}.mmi \\
+        $args \\
+        $fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/minimap2/index/meta.yml b/modules/nf-core/minimap2/index/meta.yml
new file mode 100644
index 00000000..b58f35c6
--- /dev/null
+++ b/modules/nf-core/minimap2/index/meta.yml
@@ -0,0 +1,40 @@
+name: minimap2_index
+description: Provides fasta index required by minimap2 alignment.
+keywords:
+  - index
+  - fasta
+  - reference
+tools:
+  - minimap2:
+      description: |
+        A versatile pairwise aligner for genomic and spliced nucleotide sequences.
+      homepage: https://github.com/lh3/minimap2
+      documentation: https://github.com/lh3/minimap2#uguide
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: |
+        Reference database in FASTA format.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - index:
+      type: file
+      description: Minimap2 fasta index.
+      pattern: "*.mmi"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@yuukiiwa"
+  - "@drpatelh"
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 98404560..d9cdacf6 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -41,17 +41,18 @@ include { INPUT_CHECK                      } from '../subworkflows/local/input_c
 //
 // MODULE
 //
-include { FIND_ADAPTERS                    } from '../modules/local/find_adapters'
-include { EXTRACT_UMIS                     } from '../modules/local/extract_umis'
-include { SEQ_TO_FILE as SEQ_TO_FILE_REF   } from '../modules/local/seq_to_file'
-include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file'
-include { ORIENT_REFERENCE                 } from '../modules/local/orient_reference'
-include { CIGAR_PARSER                     } from '../modules/local/cigar_parser'
-include { MERGING_SUMMARY                  } from '../modules/local/merging_summary'
-include { CLUSTERING_SUMMARY               } from '../modules/local/clustering_summary'
-include { ALIGNMENT_SUMMARY                } from '../modules/local/alignment_summary'
-include { TEMPLATE_REFERENCE               } from '../modules/local/template_reference'
-include { DUMMY_FINAL_UMI                  } from '../modules/local/dummy_final_umi'
+include { FIND_ADAPTERS                                   } from '../modules/local/find_adapters'
+include { EXTRACT_UMIS                                    } from '../modules/local/extract_umis'
+include { SEQ_TO_FILE as SEQ_TO_FILE_REF                  } from '../modules/local/seq_to_file'
+include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL                } from '../modules/local/seq_to_file'
+include { ORIENT_REFERENCE                                } from '../modules/local/orient_reference'
+include { CIGAR_PARSER                                    } from '../modules/local/cigar_parser'
+include { MERGING_SUMMARY                                 } from '../modules/local/merging_summary'
+include { CLUSTERING_SUMMARY                              } from '../modules/local/clustering_summary'
+include { ALIGNMENT_SUMMARY                               } from '../modules/local/alignment_summary'
+include { TEMPLATE_REFERENCE                              } from '../modules/local/template_reference'
+include { DUMMY_FINAL_UMI                                 } from '../modules/local/dummy_final_umi'
+include { MODIFIED_MINIMAP2 as MINIMAP2_ALIGN_CONSENSUS_1 } from '../modules/local/modified_minimap2'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -81,8 +82,10 @@ include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1        } from '../modules/nf-co
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2        } from '../modules/nf-core/minimap2/align/main'
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE     } from '../modules/nf-core/minimap2/align/main'
 include { SAMTOOLS_FAIDX                                } from '../modules/nf-core/samtools/faidx/main'
+include { MINIMAP2_INDEX                                } from '../modules/nf-core/minimap2/index/main'
 include { CUTADAPT                                      } from '../modules/nf-core/cutadapt/main'
-include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_CONSENSUS_1  } from '../modules/nf-core/samtools/index/main'
+include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_ALIGNMENT    } from '../modules/nf-core/samtools/index/main'
 
 
 /*
@@ -485,6 +488,39 @@ workflow CRISPRSEQ {
         RACON_2.out.improved_assembly
     )
 
+
+    //
+    // MODULE: Indexing the reference file
+    //
+    MINIMAP2_INDEX (
+        RACON_2.out.improved_assembly
+    )
+
+    //
+    // MODULE: Alignment to obtain the consensus of an UMI cluster
+    //
+    MINIMAP2_ALIGN_CONSENSUS_1 {
+        ch_clusters_sequence
+            .join(MINIMAP2_INDEX.out.index),
+        true,
+        false,
+        true
+    }
+
+    //
+    // MODULE: Obtain .bam.bai files
+    //
+    SAMTOOLS_INDEX_CONSENSUS_1 (
+        ch_mapped_bam
+    )
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_CONSENSUS_1.out.versions)
+
+
+    //
+    // MODULE:
+    //
+    
+
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 
@@ -574,10 +610,10 @@ workflow CRISPRSEQ {
     //
     // MODULE: Obtain .bam.bai files
     //
-    SAMTOOLS_INDEX (
+    SAMTOOLS_INDEX_ALIGNMENT (
         ch_mapped_bam
     )
-    ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions)
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_ALIGNMENT.out.versions)
 
     //
     // MODULE: Obtain a new reference with the template modification
@@ -613,7 +649,7 @@ workflow CRISPRSEQ {
     ch_versions = ch_versions.mix(MINIMAP2_ALIGN_TEMPLATE.out.versions)
 
     ch_mapped_bam
-        .join(SAMTOOLS_INDEX.out.bai)
+        .join(SAMTOOLS_INDEX_ALIGNMENT.out.bai)
         .join(ORIENT_REFERENCE.out.reference)
         .join(INPUT_CHECK.out.protospacer
             .map {

From d883f1529f7900bc50d26ee0b95034745eacda62 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <qeami01@rddsws.am10.uni-tuebingen.de>
Date: Tue, 9 May 2023 10:48:33 +0200
Subject: [PATCH 23/45] remove mini_align as it's now performed by
 medaka_consensus

---
 conf/modules.config                |  8 -----
 modules/local/modified_minimap2.nf | 47 -----------------------------
 workflows/crisprseq.nf             | 48 +++++-------------------------
 3 files changed, 7 insertions(+), 96 deletions(-)
 delete mode 100644 modules/local/modified_minimap2.nf

diff --git a/conf/modules.config b/conf/modules.config
index 9a9c9978..94c79519 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -145,14 +145,6 @@ process {
         ext.prefix = { "${reads.baseName}_cycle2" }
     }
 
-    WithName: MINIMAP2_INDEX {
-        ext.args = '-I 16G -x map-ont'
-    }
-
-    withName: MINIMAP2_ALIGN_CONSENSUS_1 {
-        ext.args = '-x map-ont --secondary=no --MD'
-    }
-
     withName: DUMMY_FINAL_UMI {
         publishDir = [
             path: { "${params.outdir}/preprocessing/UMI" },
diff --git a/modules/local/modified_minimap2.nf b/modules/local/modified_minimap2.nf
deleted file mode 100644
index 12e12374..00000000
--- a/modules/local/modified_minimap2.nf
+++ /dev/null
@@ -1,47 +0,0 @@
-process MODIFIED_MINIMAP2 {
-    tag "$meta.id"
-    label 'process_medium'
-
-    // Note: the versions here need to match the versions used in the mulled container below and minimap2/index
-    conda "bioconda::minimap2=2.24 bioconda::samtools=1.14"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' :
-        'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }"
-
-    input:
-    tuple val(meta), path(reads), path(reference)
-    val bam_format
-    val cigar_paf_format
-    val cigar_bam
-
-    output:
-    tuple val(meta), path("*.paf"), optional: true, emit: paf
-    tuple val(meta), path("*.bam"), optional: true, emit: bam
-    path "versions.yml"           , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    def bam_output = bam_format ? "-a | samtools view -@ ${task.cpus} -T ${reference} -F 2308 -bS - | samtools sort -@ ${task.cpus} -o ${prefix}.bam" : "-o ${prefix}.paf"
-    def cigar_paf = cigar_paf_format && !bam_format ? "-c" : ''
-    def set_cigar_bam = cigar_bam && bam_format ? "-L" : ''
-    """
-    minimap2 \\
-        $args \\
-        -t $task.cpus \\
-        "${reference ?: reads}" \\
-        "$reads" \\
-        $cigar_paf \\
-        $set_cigar_bam \\
-        $bam_output
-
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        minimap2: \$(minimap2 --version 2>&1)
-    END_VERSIONS
-    """
-}
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index d9cdacf6..f4869d43 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -52,7 +52,6 @@ include { CLUSTERING_SUMMARY                              } from '../modules/loc
 include { ALIGNMENT_SUMMARY                               } from '../modules/local/alignment_summary'
 include { TEMPLATE_REFERENCE                              } from '../modules/local/template_reference'
 include { DUMMY_FINAL_UMI                                 } from '../modules/local/dummy_final_umi'
-include { MODIFIED_MINIMAP2 as MINIMAP2_ALIGN_CONSENSUS_1 } from '../modules/local/modified_minimap2'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -83,9 +82,9 @@ include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2        } from '../modules/nf-co
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE     } from '../modules/nf-core/minimap2/align/main'
 include { SAMTOOLS_FAIDX                                } from '../modules/nf-core/samtools/faidx/main'
 include { MINIMAP2_INDEX                                } from '../modules/nf-core/minimap2/index/main'
+include { MEDAKA as MEDAKA_1                            } from '../modules/nf-core/medaka/main'
 include { CUTADAPT                                      } from '../modules/nf-core/cutadapt/main'
-include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_CONSENSUS_1  } from '../modules/nf-core/samtools/index/main'
-include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_ALIGNMENT    } from '../modules/nf-core/samtools/index/main'
+include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
 
 
 /*
@@ -482,44 +481,11 @@ workflow CRISPRSEQ {
 
 
     //
-    // MODULE: Convert reference .fa to .fai
+    // MODULE: Obtain a consensus sequence
     //
-    SAMTOOLS_FAIDX (
-        RACON_2.out.improved_assembly
-    )
-
-
-    //
-    // MODULE: Indexing the reference file
-    //
-    MINIMAP2_INDEX (
-        RACON_2.out.improved_assembly
-    )
-
-    //
-    // MODULE: Alignment to obtain the consensus of an UMI cluster
-    //
-    MINIMAP2_ALIGN_CONSENSUS_1 {
-        ch_clusters_sequence
-            .join(MINIMAP2_INDEX.out.index),
-        true,
-        false,
-        true
-    }
+    MEDAKA_1 (
 
-    //
-    // MODULE: Obtain .bam.bai files
-    //
-    SAMTOOLS_INDEX_CONSENSUS_1 (
-        ch_mapped_bam
     )
-    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_CONSENSUS_1.out.versions)
-
-
-    //
-    // MODULE:
-    //
-    
 
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
@@ -610,10 +576,10 @@ workflow CRISPRSEQ {
     //
     // MODULE: Obtain .bam.bai files
     //
-    SAMTOOLS_INDEX_ALIGNMENT (
+    SAMTOOLS_INDEX (
         ch_mapped_bam
     )
-    ch_versions = ch_versions.mix(SAMTOOLS_INDEX_ALIGNMENT.out.versions)
+    ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions)
 
     //
     // MODULE: Obtain a new reference with the template modification
@@ -649,7 +615,7 @@ workflow CRISPRSEQ {
     ch_versions = ch_versions.mix(MINIMAP2_ALIGN_TEMPLATE.out.versions)
 
     ch_mapped_bam
-        .join(SAMTOOLS_INDEX_ALIGNMENT.out.bai)
+        .join(SAMTOOLS_INDEX.out.bai)
         .join(ORIENT_REFERENCE.out.reference)
         .join(INPUT_CHECK.out.protospacer
             .map {

From de43d353abbb9d8467c958ab8254a3e35500884b Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <qeami01@rddsws.am10.uni-tuebingen.de>
Date: Tue, 9 May 2023 11:12:52 +0200
Subject: [PATCH 24/45] add medaka process

---
 conf/modules.config    | 5 +++++
 nextflow.config        | 1 +
 nextflow_schema.json   | 6 ++++++
 workflows/crisprseq.nf | 7 ++++---
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 94c79519..c9078b2e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -145,6 +145,11 @@ process {
         ext.prefix = { "${reads.baseName}_cycle2" }
     }
 
+    withName: MEDAKA {
+        ext.args = '-m r941_min_high_g303'
+        ext.prefix = { "${reads.baseName}_consensus" }
+    }
+
     withName: DUMMY_FINAL_UMI {
         publishDir = [
             path: { "${params.outdir}/preprocessing/UMI" },
diff --git a/nextflow.config b/nextflow.config
index 4af7c0b1..cd61a1fa 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -15,6 +15,7 @@ params {
 
     // UMI parameters
     umi_bin_size               = 1
+    medaka_model               = 'r941_min_high_g303'
 
     // Vsearch options
     vsearch_minseqlength       = 55
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 74b49103..260f215e 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -53,6 +53,12 @@
                     "default": 1,
                     "description": "Minimum size of a UMI cluster.",
                     "fa_icon": "fas fa-sort-amount-down"
+                },
+                "medaka_model": {
+                    "type": "string",
+                    "default": "r941_min_high_g360",
+                    "fa_icon": "fas fa-font",
+                    "description": "Medaka model (-m) to use according to the basecaller used."
                 }
             },
             "fa_icon": "fas fa-layer-group"
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index f4869d43..037f91a0 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -82,7 +82,7 @@ include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2        } from '../modules/nf-co
 include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE     } from '../modules/nf-core/minimap2/align/main'
 include { SAMTOOLS_FAIDX                                } from '../modules/nf-core/samtools/faidx/main'
 include { MINIMAP2_INDEX                                } from '../modules/nf-core/minimap2/index/main'
-include { MEDAKA as MEDAKA_1                            } from '../modules/nf-core/medaka/main'
+include { MEDAKA                                        } from '../modules/nf-core/medaka/main'
 include { CUTADAPT                                      } from '../modules/nf-core/cutadapt/main'
 include { SAMTOOLS_INDEX                                } from '../modules/nf-core/samtools/index/main'
 
@@ -483,8 +483,9 @@ workflow CRISPRSEQ {
     //
     // MODULE: Obtain a consensus sequence
     //
-    MEDAKA_1 (
-
+    MEDAKA (
+        ch_clusters_sequence
+            .join(RACON_2.out.improved_assembly)
     )
 
     /*

From 353a8ba92e58793cefb6b1306a12ff8feebbdef9 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Tue, 9 May 2023 11:28:48 +0200
Subject: [PATCH 25/45] join all umi consensus in a file

---
 workflows/crisprseq.nf | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 037f91a0..b5b8a527 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -488,14 +488,30 @@ workflow CRISPRSEQ {
             .join(RACON_2.out.improved_assembly)
     )
 
+    // Collect all consensus UMI sequences into one single file per sample
+    MEDAKA.out.assembly
+    .tap{ meta_channel_4 }
+    .collectFile() { meta, file ->
+        [ "${meta.id}_consensus.fasta", file ]
+    }
+    .join(meta_channel_4
+        .map{ meta, consensus ->
+            ["${consensus.baseName}", meta]
+        }
+    )
+    .map{ name, file, meta ->
+        [meta - meta.subMap('cluster_id'), file]
+    }
+    .set{ ch_umi_consensus }
+
+    ch_umi_consensus.view()
+
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 
 
     Modules to implement:
 
-    consensus
-    join_reads
     fa2fq
 
     */

From ac9f2674e4f3adc979c9c2e1ac19026ccfdd4da2 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Tue, 9 May 2023 13:27:46 +0200
Subject: [PATCH 26/45] convert fasta to fastq after umi consensus

---
 conf/modules.config    |  4 ++++
 nextflow.config        |  1 +
 workflows/crisprseq.nf | 28 ++++++++++++++++++----------
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index c9078b2e..3b461fae 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -150,6 +150,10 @@ process {
         ext.prefix = { "${reads.baseName}_consensus" }
     }
 
+    withName: SEQTK_SEQ_FATOFQ {
+        ext.args = '-F "#"'
+    }
+
     withName: DUMMY_FINAL_UMI {
         publishDir = [
             path: { "${params.outdir}/preprocessing/UMI" },
diff --git a/nextflow.config b/nextflow.config
index cd61a1fa..e1bdf5d4 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -108,6 +108,7 @@ profiles {
     docker {
         docker.enabled         = true
         docker.userEmulation   = true
+        docker.registry        = 'quay.io'
         singularity.enabled    = false
         podman.enabled         = false
         shifter.enabled        = false
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index b5b8a527..a874058e 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -67,7 +67,8 @@ include { MULTIQC                                       } from '../modules/nf-co
 include { CUSTOM_DUMPSOFTWAREVERSIONS                   } from '../modules/nf-core/custom/dumpsoftwareversions/main'
 include { PEAR                                          } from '../modules/nf-core/pear/main'
 include { CAT_FASTQ                                     } from '../modules/nf-core/cat/fastq/main'
-include { SEQTK_SEQ                                     } from '../modules/nf-core/seqtk/seq/main'
+include { SEQTK_SEQ as SEQTK_SEQ_MASK                   } from '../modules/nf-core/seqtk/seq/main'
+include { SEQTK_SEQ as SEQTK_SEQ_FATOFQ                 } from '../modules/nf-core/seqtk/seq/main'
 include { VSEARCH_CLUSTER                               } from '../modules/nf-core/vsearch/cluster/main'
 include { VSEARCH_SORT                                  } from '../modules/nf-core/vsearch/sort/main'
 include { RACON as RACON_1                              } from '../modules/nf-core/racon/main'
@@ -288,10 +289,10 @@ workflow CRISPRSEQ {
     //
     // MODULE: Mask (convert to Ns) bases with quality lower than 20 and remove sequences shorter than 80
     //
-    SEQTK_SEQ (
+    SEQTK_SEQ_MASK (
         ch_trimmed
     )
-    ch_versions = ch_versions.mix(SEQTK_SEQ.out.versions)
+    ch_versions = ch_versions.mix(SEQTK_SEQ_MASK.out.versions)
 
 
     //
@@ -301,7 +302,7 @@ workflow CRISPRSEQ {
         ch_cat_fastq.paired
             .mix(ch_cat_fastq.single)
             .join(PEAR.out.assembled, remainder: true)
-            .join(SEQTK_SEQ.out.fastx)
+            .join(SEQTK_SEQ_MASK.out.fastx)
             .join(CUTADAPT.out.log)
             .map { meta, reads, assembled, masked, trimmed ->
                 if (assembled == null) {
@@ -318,7 +319,7 @@ workflow CRISPRSEQ {
     // MODULE: Extract UMI sequences
     //
     EXTRACT_UMIS (
-        SEQTK_SEQ.out.fastx
+        SEQTK_SEQ_MASK.out.fastx
     )
 
 
@@ -506,6 +507,13 @@ workflow CRISPRSEQ {
 
     ch_umi_consensus.view()
 
+    //
+    // MODULE: Convert fasta to fastq
+    //
+    SEQTK_SEQ_FATOFQ (
+        ch_umi_consensus
+    )
+
     /*
     The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
 
@@ -518,14 +526,14 @@ workflow CRISPRSEQ {
 
     // Dummy process simulating the last UMi clustering step to obtain clusters as fastq
     DUMMY_FINAL_UMI {
-        SEQTK_SEQ.out.fastx
+        SEQTK_SEQ_MASK.out.fastx
     }
 
     //
     // MODULE: Summary of clustered reads
     //
     CLUSTERING_SUMMARY (
-        SEQTK_SEQ.out.fastx
+        SEQTK_SEQ_MASK.out.fastx
             .join(MERGING_SUMMARY.out.summary)
     )
 
@@ -535,7 +543,7 @@ workflow CRISPRSEQ {
     //
     if (params.aligner == "minimap2") {
         MINIMAP2_ALIGN (
-            SEQTK_SEQ.out.fastx
+            SEQTK_SEQ_MASK.out.fastx
                 .join(ORIENT_REFERENCE.out.reference),
             true,
             false,
@@ -554,7 +562,7 @@ workflow CRISPRSEQ {
         )
         ch_versions = ch_versions.mix(BWA_INDEX.out.versions)
         BWA_MEM (
-            SEQTK_SEQ.out.fastx,
+            SEQTK_SEQ_MASK.out.fastx,
             BWA_INDEX.out.index,
             true
         )
@@ -571,7 +579,7 @@ workflow CRISPRSEQ {
         )
         ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions)
         BOWTIE2_ALIGN (
-            SEQTK_SEQ.out.fastx,
+            SEQTK_SEQ_MASK.out.fastx,
             BOWTIE2_BUILD.out.index,
             false,
             true

From be105b4315ea6ac1af3b2c6ffbec7450cafb7a68 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Tue, 9 May 2023 13:29:17 +0200
Subject: [PATCH 27/45] remove dummi_final_umi process

---
 modules/local/dummy_final_umi.nf | 55 --------------------------------
 workflows/crisprseq.nf           | 15 ---------
 2 files changed, 70 deletions(-)
 delete mode 100644 modules/local/dummy_final_umi.nf

diff --git a/modules/local/dummy_final_umi.nf b/modules/local/dummy_final_umi.nf
deleted file mode 100644
index 64069410..00000000
--- a/modules/local/dummy_final_umi.nf
+++ /dev/null
@@ -1,55 +0,0 @@
-process DUMMY_FINAL_UMI {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "conda-forge::p7zip==16.02"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/p7zip:16.02' :
-        'quay.io/biocontainers/p7zip:16.02' }"
-
-    input:
-    tuple val(meta), path(reads)
-
-    output:
-    tuple val(meta), path("*_clustered.fastq.gz"), emit: dummy
-    path "versions.yml"                       , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    touch ${prefix}_clustered.fastq
-
-    7za \\
-        a \\
-        $args \\
-        ${prefix}_clustered.fastq.gz \\
-        ${prefix}_clustered.fastq
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//')
-    END_VERSIONS
-    """
-}
-
-
-
-process summary_reads{
-
-    input:
-    set sampleID, file(summary), file(finalReads) from summaryReport.join(readsSummary)
-
-    output:
-    set val(sampleID), file(summary) into summaryReportReads
-
-    script:
-    """
-    final_count=`expr \$(cat $finalReads | wc -l) / 4`
-    echo "clustered-reads," \$final_count >> ${sampleID}_summary.csv
-    """
-
-    }
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index a874058e..94c3505e 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -51,7 +51,6 @@ include { MERGING_SUMMARY                                 } from '../modules/loc
 include { CLUSTERING_SUMMARY                              } from '../modules/local/clustering_summary'
 include { ALIGNMENT_SUMMARY                               } from '../modules/local/alignment_summary'
 include { TEMPLATE_REFERENCE                              } from '../modules/local/template_reference'
-include { DUMMY_FINAL_UMI                                 } from '../modules/local/dummy_final_umi'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -514,20 +513,6 @@ workflow CRISPRSEQ {
         ch_umi_consensus
     )
 
-    /*
-    The UMI clustering step is posponed until the next release, the steps to be implemented are listed below:
-
-
-    Modules to implement:
-
-    fa2fq
-
-    */
-
-    // Dummy process simulating the last UMi clustering step to obtain clusters as fastq
-    DUMMY_FINAL_UMI {
-        SEQTK_SEQ_MASK.out.fastx
-    }
 
     //
     // MODULE: Summary of clustered reads

From 798b4dbdad68cbc9ecd0e823e4215f56ba4653aa Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Tue, 9 May 2023 13:34:13 +0200
Subject: [PATCH 28/45] pass final umi consensus tu clustering_summary

---
 workflows/crisprseq.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 94c3505e..7a603bee 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -518,7 +518,7 @@ workflow CRISPRSEQ {
     // MODULE: Summary of clustered reads
     //
     CLUSTERING_SUMMARY (
-        SEQTK_SEQ_MASK.out.fastx
+        SEQTK_SEQ_FATOFQ.out.fastx
             .join(MERGING_SUMMARY.out.summary)
     )
 

From 8d10395f3c16090de7852cddee4f41f215e444b3 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Tue, 9 May 2023 13:35:03 +0200
Subject: [PATCH 29/45] pass final umi consensus to mapping steps

---
 workflows/crisprseq.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 7a603bee..c59be2f0 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -528,7 +528,7 @@ workflow CRISPRSEQ {
     //
     if (params.aligner == "minimap2") {
         MINIMAP2_ALIGN (
-            SEQTK_SEQ_MASK.out.fastx
+            SEQTK_SEQ_FATOFQ.out.fastx
                 .join(ORIENT_REFERENCE.out.reference),
             true,
             false,
@@ -547,7 +547,7 @@ workflow CRISPRSEQ {
         )
         ch_versions = ch_versions.mix(BWA_INDEX.out.versions)
         BWA_MEM (
-            SEQTK_SEQ_MASK.out.fastx,
+            SEQTK_SEQ_FATOFQ.out.fastx,
             BWA_INDEX.out.index,
             true
         )
@@ -564,7 +564,7 @@ workflow CRISPRSEQ {
         )
         ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions)
         BOWTIE2_ALIGN (
-            SEQTK_SEQ_MASK.out.fastx,
+            SEQTK_SEQ_FATOFQ.out.fastx,
             BOWTIE2_BUILD.out.index,
             false,
             true

From 42302dd92307af0279876f25fcff39041ea2a23f Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Thu, 11 May 2023 14:21:07 +0200
Subject: [PATCH 30/45] unzip input medaka

---
 conf/modules.config                |  3 ---
 modules.json                       |  3 ++-
 modules/nf-core/medaka/main.nf     |  3 +++
 modules/nf-core/medaka/medaka.diff | 15 +++++++++++++++
 4 files changed, 20 insertions(+), 4 deletions(-)
 create mode 100644 modules/nf-core/medaka/medaka.diff

diff --git a/conf/modules.config b/conf/modules.config
index 3b461fae..478a8e39 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -152,9 +152,6 @@ process {
 
     withName: SEQTK_SEQ_FATOFQ {
         ext.args = '-F "#"'
-    }
-
-    withName: DUMMY_FINAL_UMI {
         publishDir = [
             path: { "${params.outdir}/preprocessing/UMI" },
             mode: params.publish_dir_mode,
diff --git a/modules.json b/modules.json
index 456ae16b..0b50ffed 100644
--- a/modules.json
+++ b/modules.json
@@ -49,7 +49,8 @@
                     "medaka": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/medaka/medaka.diff"
                     },
                     "minimap2/align": {
                         "branch": "master",
diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf
index d60915ee..a4eaa998 100644
--- a/modules/nf-core/medaka/main.nf
+++ b/modules/nf-core/medaka/main.nf
@@ -21,6 +21,9 @@ process MEDAKA {
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
+    gzip -cdf $reads > ${reads.baseName}
+    gzip -cdf $assembly > ${assembly.baseName}
+
     medaka_consensus \\
         -t $task.cpus \\
         $args \\
diff --git a/modules/nf-core/medaka/medaka.diff b/modules/nf-core/medaka/medaka.diff
new file mode 100644
index 00000000..1c9d16f9
--- /dev/null
+++ b/modules/nf-core/medaka/medaka.diff
@@ -0,0 +1,15 @@
+Changes in module 'nf-core/medaka'
+--- modules/nf-core/medaka/main.nf
++++ modules/nf-core/medaka/main.nf
+@@ -21,6 +21,9 @@
+     def args = task.ext.args ?: ''
+     def prefix = task.ext.prefix ?: "${meta.id}"
+     """
++    gzip -cdf $reads > ${reads.baseName}
++    gzip -cdf $assembly > ${assembly.baseName}
++
+     medaka_consensus \\
+         -t $task.cpus \\
+         $args \\
+
+************************************************************

From a9969062e3bc61ad6071707094d376f3ae79810e Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 15 May 2023 13:12:02 +0200
Subject: [PATCH 31/45] make removing overrepresented sequences optional

---
 nextflow.config        |  3 ++
 nextflow_schema.json   | 17 ++++++++++
 workflows/crisprseq.nf | 72 +++++++++++++++++++++++-------------------
 3 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index e1bdf5d4..db3154b2 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,6 +13,9 @@ params {
     input                      = null
     aligner                    = 'minimap2'
 
+    // Pipeline steps
+    overrepresented            = false
+
     // UMI parameters
     umi_bin_size               = 1
     medaka_model               = 'r941_min_high_g303'
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 260f215e..a0e469ea 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -5,6 +5,20 @@
     "description": "Pipeline for the analysis of crispr data",
     "type": "object",
     "definitions": {
+        "popeline_steps": {
+            "title": "Popeline steps",
+            "type": "object",
+            "description": "Alternative pipeline steps to include in the analysis.",
+            "default": "",
+            "properties": {
+                "overrepresented": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-sort-numeric-up-alt",
+                    "description": "Trim overrepresented sequences from reads (cutadapt)"
+                }
+            },
+            "fa_icon": "fas fa-shoe-prints"
+        },
         "input_output_options": {
             "title": "Input/output options",
             "type": "object",
@@ -340,6 +354,9 @@
         }
     },
     "allOf": [
+        {
+            "$ref": "#/definitions/popeline_steps"
+        },
         {
             "$ref": "#/definitions/input_output_options"
         },
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index c59be2f0..3a3a1baf 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -249,41 +249,49 @@ workflow CRISPRSEQ {
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
-    //
-    // MODULE: Find overrepresented sequences
-    FIND_ADAPTERS (
-        FASTQC.out.zip
-    )
-    .adapters
-    .join(ch_pear_fastq)
-    .groupTuple(by: [0])
-    // Separate samples by containing overrepresented sequences or not
-    .branch {
-        meta, adapter_seqs, reads ->
-            no_adapters: adapter_seqs[0].size() == 0
-                return [ meta, reads[0] ]
-            adapters   : adapter_seqs[0].size() > 0
-                return [ meta, reads[0], adapter_seqs[0] ]
-    }
-    .set { ch_adapter_seqs }
+    ch_trimmed = Channel.empty()
 
+    if (params.overrepresented) {
+        //
+        // MODULE: Find overrepresented sequences
+        FIND_ADAPTERS (
+            FASTQC.out.zip
+        )
+        .adapters
+        .join(ch_pear_fastq)
+        .groupTuple(by: [0])
+        // Separate samples by containing overrepresented sequences or not
+        .branch {
+            meta, adapter_seqs, reads ->
+                no_adapters: adapter_seqs[0].size() == 0
+                    return [ meta, reads[0] ]
+                adapters   : adapter_seqs[0].size() > 0
+                    return [ meta, reads[0], adapter_seqs[0] ]
+        }
+        .set { ch_adapter_seqs }
 
-    //
-    // MODULE: Trim adapter sequences
-    //
-    CUTADAPT (
-        ch_adapter_seqs.adapters
-    )
-    ch_versions = ch_versions.mix(CUTADAPT.out.versions)
 
-    ch_adapter_seqs.no_adapters
-    .mix(CUTADAPT.out.reads)
-    .groupTuple(by: [0])
-    .map {
-        meta, fastq ->
-                return [ meta, fastq.flatten() ]
+        //
+        // MODULE: Trim adapter sequences
+        //
+        CUTADAPT (
+            ch_adapter_seqs.adapters
+        )
+        ch_versions = ch_versions.mix(CUTADAPT.out.versions)
+
+        ch_adapter_seqs.no_adapters
+        .mix(CUTADAPT.out.reads)
+        .groupTuple(by: [0])
+        .map {
+            meta, fastq ->
+                    return [ meta, fastq.flatten() ]
+        }
+        .set{ ch_trimmed }
+    } else {
+        ch_trimmed = ch_pear_fastq
     }
-    .set{ ch_trimmed }
+
+    ch_trimmed.view()
 
     //
     // MODULE: Mask (convert to Ns) bases with quality lower than 20 and remove sequences shorter than 80
@@ -302,7 +310,7 @@ workflow CRISPRSEQ {
             .mix(ch_cat_fastq.single)
             .join(PEAR.out.assembled, remainder: true)
             .join(SEQTK_SEQ_MASK.out.fastx)
-            .join(CUTADAPT.out.log)
+            .join(( params.overrepresented ? CUTADAPT.out.log : Channel.value("null") ))
             .map { meta, reads, assembled, masked, trimmed ->
                 if (assembled == null) {
                     dummy = file('null')

From 2706b7aab4f0cbdee54ae75f9e6c2fe227534f74 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Mon, 15 May 2023 14:39:35 +0200
Subject: [PATCH 32/45] unzip medaka input

---
 modules/nf-core/medaka/main.nf     | 26 +++++++++++++------
 modules/nf-core/medaka/medaka.diff | 41 +++++++++++++++++++++++++++---
 workflows/crisprseq.nf             | 36 +++++++++++++-------------
 3 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf
index a4eaa998..b02119b7 100644
--- a/modules/nf-core/medaka/main.nf
+++ b/modules/nf-core/medaka/main.nf
@@ -11,8 +11,8 @@ process MEDAKA {
     tuple val(meta), path(reads), path(assembly)
 
     output:
-    tuple val(meta), path("*.fa.gz"), emit: assembly
-    path "versions.yml"             , emit: versions
+    tuple val(meta), path("*.fasta.gz"), emit: assembly
+    path "versions.yml"                , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -21,19 +21,29 @@ process MEDAKA {
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    gzip -cdf $reads > ${reads.baseName}
-    gzip -cdf $assembly > ${assembly.baseName}
+    if [[ "${reads.extension}" == "gz" ]]; then
+        gzip -df $reads
+        reads=$reads.baseName
+    else
+        reads=$reads
+    fi
+    if [[ "${assembly.extension}" == "gz" ]]; then
+        gzip -df $assembly
+        assembly=$assembly.baseName
+    else
+        assembly=$assembly.baseName
+    fi
 
     medaka_consensus \\
         -t $task.cpus \\
         $args \\
-        -i $reads \\
-        -d $assembly \\
+        -i \$reads \\
+        -d \$assembly \\
         -o ./
 
-    mv consensus.fasta ${prefix}.fa
+    mv consensus.fasta ${prefix}.fasta
 
-    gzip -n ${prefix}.fa
+    gzip -n ${prefix}.fasta
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/medaka/medaka.diff b/modules/nf-core/medaka/medaka.diff
index 1c9d16f9..d0b2fb34 100644
--- a/modules/nf-core/medaka/medaka.diff
+++ b/modules/nf-core/medaka/medaka.diff
@@ -1,15 +1,50 @@
 Changes in module 'nf-core/medaka'
 --- modules/nf-core/medaka/main.nf
 +++ modules/nf-core/medaka/main.nf
-@@ -21,6 +21,9 @@
+@@ -11,8 +11,8 @@
+     tuple val(meta), path(reads), path(assembly)
+ 
+     output:
+-    tuple val(meta), path("*.fa.gz"), emit: assembly
+-    path "versions.yml"             , emit: versions
++    tuple val(meta), path("*.fasta.gz"), emit: assembly
++    path "versions.yml"                , emit: versions
+ 
+     when:
+     task.ext.when == null || task.ext.when
+@@ -21,16 +21,29 @@
      def args = task.ext.args ?: ''
      def prefix = task.ext.prefix ?: "${meta.id}"
      """
-+    gzip -cdf $reads > ${reads.baseName}
-+    gzip -cdf $assembly > ${assembly.baseName}
++    if [[ "${reads.extension}" == "gz" ]]; then
++        gzip -df $reads
++        reads=$reads.baseName
++    else
++        reads=$reads
++    fi
++    if [[ "${assembly.extension}" == "gz" ]]; then
++        gzip -df $assembly
++        assembly=$assembly.baseName
++    else
++        assembly=$assembly.baseName
++    fi
 +
      medaka_consensus \\
          -t $task.cpus \\
          $args \\
+-        -i $reads \\
+-        -d $assembly \\
++        -i \$reads \\
++        -d \$assembly \\
+         -o ./
+ 
+-    mv consensus.fasta ${prefix}.fa
++    mv consensus.fasta ${prefix}.fasta
+ 
+-    gzip -n ${prefix}.fa
++    gzip -n ${prefix}.fasta
+ 
+     cat <<-END_VERSIONS > versions.yml
+     "${task.process}":
 
 ************************************************************
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 3a3a1baf..ca2fd24c 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -291,7 +291,6 @@ workflow CRISPRSEQ {
         ch_trimmed = ch_pear_fastq
     }
 
-    ch_trimmed.view()
 
     //
     // MODULE: Mask (convert to Ns) bases with quality lower than 20 and remove sequences shorter than 80
@@ -389,49 +388,50 @@ workflow CRISPRSEQ {
 
     // Get the correspondent fasta sequencences from top cluster sequences
     // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps
-    VSEARCH_SORT.out.fasta
-    .tap{ meta_channel_2 }
+    VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta]
+    .tap{ meta_channel_2 } // [[id:sample_id, ...], sample_top.fasta]
     .map{ meta, fasta ->
         fasta_line = umi_to_sequence_centroid(fasta)
-        [meta, fasta.baseName, fasta_line]
+        [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...]
     }
     .collectFile() { meta, name, fasta ->
-        [ "${name}.fasta", fasta ]
+        [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta
     }
     .map{ new_file ->
-        [new_file.baseName[0..-5], new_file] // Substring is removing "_top" added by VSEARCH_SORT
+        [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta]
     }
     .join(meta_channel_2
         .map { meta, original_file ->
-            ["${original_file.baseName}"[0..-5], meta] // Substring is removing "_top" added by VSEARCH_SORT
-        })
+            ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]]
+        }) // [sample, sample_top.fasta, [id:sample_id, ...]]
     .map{ file_name, new_file, meta ->
-        [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map
+        [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
     }
-    .set{ ch_top_clusters_sequence }
+    .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
 
     // Get the correspondent fasta sequencences from UMI clusters
-    ch_umi_bysize.cluster
-    .tap{ meta_channel_3 }
+    ch_umi_bysize.cluster // [[id:sample_id, ...], sample]
+    .tap{ meta_channel_3 } // [[id:sample_id, ...], sample]
     .map{ meta, cluster ->
         fasta_line = umi_to_sequence(cluster)
-        [meta, cluster.baseName, fasta_line]
+        [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...]
     }
     .collectFile() { meta, name, fasta ->
-        [ "${name}_sequences.fasta", fasta ]
+        [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta
     }
     .map{ new_file ->
-        [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile
+        [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta]
     }
     .join(meta_channel_3
         .map { meta, original_file ->
-            ["${original_file.baseName}", meta]
-        })
+            ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]]
+        }) // [sample, sample_sequences.fasta, [id:sample_id, ...]]
     .map{ file_name, new_file, meta ->
-        [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map
+        [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_sequences.fasta]
     }
     .set{ ch_clusters_sequence }
 
+
     // Cluster consensus & polishing
     // Two cycles of minimap2 + racon
 

From c0974bf45a81955744935dc7979ab7a61aa06b5e Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Mon, 15 May 2023 15:45:57 +0200
Subject: [PATCH 33/45] fix medaka error

---
 conf/modules.config                |  2 +-
 modules/nf-core/medaka/main.nf     |  8 +++-----
 modules/nf-core/medaka/medaka.diff | 13 ++++++-------
 workflows/crisprseq.nf             | 10 ++++++++--
 4 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 478a8e39..fe53493a 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -147,7 +147,7 @@ process {
 
     withName: MEDAKA {
         ext.args = '-m r941_min_high_g303'
-        ext.prefix = { "${reads.baseName}_consensus" }
+        ext.prefix = { "${reads.baseName}_medakaConsensus" }
     }
 
     withName: SEQTK_SEQ_FATOFQ {
diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf
index b02119b7..80e4c065 100644
--- a/modules/nf-core/medaka/main.nf
+++ b/modules/nf-core/medaka/main.nf
@@ -11,8 +11,8 @@ process MEDAKA {
     tuple val(meta), path(reads), path(assembly)
 
     output:
-    tuple val(meta), path("*.fasta.gz"), emit: assembly
-    path "versions.yml"                , emit: versions
+    tuple val(meta), path("*_medakaConsensus.fasta"), emit: assembly
+    path "versions.yml"                             , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -31,7 +31,7 @@ process MEDAKA {
         gzip -df $assembly
         assembly=$assembly.baseName
     else
-        assembly=$assembly.baseName
+        assembly=$assembly
     fi
 
     medaka_consensus \\
@@ -43,8 +43,6 @@ process MEDAKA {
 
     mv consensus.fasta ${prefix}.fasta
 
-    gzip -n ${prefix}.fasta
-
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' )
diff --git a/modules/nf-core/medaka/medaka.diff b/modules/nf-core/medaka/medaka.diff
index d0b2fb34..7f8752ac 100644
--- a/modules/nf-core/medaka/medaka.diff
+++ b/modules/nf-core/medaka/medaka.diff
@@ -7,12 +7,12 @@ Changes in module 'nf-core/medaka'
      output:
 -    tuple val(meta), path("*.fa.gz"), emit: assembly
 -    path "versions.yml"             , emit: versions
-+    tuple val(meta), path("*.fasta.gz"), emit: assembly
-+    path "versions.yml"                , emit: versions
++    tuple val(meta), path("*_medakaConsensus.fasta"), emit: assembly
++    path "versions.yml"                             , emit: versions
  
      when:
      task.ext.when == null || task.ext.when
-@@ -21,16 +21,29 @@
+@@ -21,16 +21,27 @@
      def args = task.ext.args ?: ''
      def prefix = task.ext.prefix ?: "${meta.id}"
      """
@@ -26,7 +26,7 @@ Changes in module 'nf-core/medaka'
 +        gzip -df $assembly
 +        assembly=$assembly.baseName
 +    else
-+        assembly=$assembly.baseName
++        assembly=$assembly
 +    fi
 +
      medaka_consensus \\
@@ -39,10 +39,9 @@ Changes in module 'nf-core/medaka'
          -o ./
  
 -    mv consensus.fasta ${prefix}.fa
-+    mv consensus.fasta ${prefix}.fasta
- 
+-
 -    gzip -n ${prefix}.fa
-+    gzip -n ${prefix}.fasta
++    mv consensus.fasta ${prefix}.fasta
  
      cat <<-END_VERSIONS > versions.yml
      "${task.process}":
diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index ca2fd24c..0c64cc95 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -499,12 +499,19 @@ workflow CRISPRSEQ {
     // Collect all consensus UMI sequences into one single file per sample
     MEDAKA.out.assembly
     .tap{ meta_channel_4 }
+    .map{ meta, file ->
+        file_content = file.getText()
+        [meta, file_content] // [[id:sample_id, ...], consensus_content]
+    }
     .collectFile() { meta, file ->
         [ "${meta.id}_consensus.fasta", file ]
     }
+    .map{ new_file ->
+        [new_file.baseName, new_file] 
+    }
     .join(meta_channel_4
         .map{ meta, consensus ->
-            ["${consensus.baseName}", meta]
+            ["${meta.id}_consensus", meta]
         }
     )
     .map{ name, file, meta ->
@@ -512,7 +519,6 @@ workflow CRISPRSEQ {
     }
     .set{ ch_umi_consensus }
 
-    ch_umi_consensus.view()
 
     //
     // MODULE: Convert fasta to fastq

From 565e6e89a5c1b91895d516ccfedf94e6fdb8ab41 Mon Sep 17 00:00:00 2001
From: Julia Mir Pedrol <mirp.julia@gmail.com>
Date: Mon, 22 May 2023 17:38:18 +0200
Subject: [PATCH 34/45] hopefully fix umis bug

---
 workflows/crisprseq.nf | 52 +++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf
index 0c64cc95..ca4f15bb 100644
--- a/workflows/crisprseq.nf
+++ b/workflows/crisprseq.nf
@@ -93,15 +93,17 @@ include { SAMTOOLS_INDEX                                } from '../modules/nf-co
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-def umi_to_sequence(cluster) {
-    cluster.withReader { source ->
-        String line
-        sequences = ""
-        while ( line=source.readLine() ) {
-            if (line.startsWith(">")) {
-                sequence = (line =~ /;seq=(.*$)/)[0][1]
-                id = (line =~ /(>.*?);/)[0][1]
-                sequences = sequences + id + "\n" + sequence + "\n"
+def umi_to_sequence(cluster1) {
+    String line1
+    String sequences = ""
+    String sequence1
+    String id1
+    cluster1.withReader {
+        while ( line1=it.readLine() ) {
+            if (line1.startsWith(">")) {
+                sequence1 = (line1 =~ /;seq=(.*$)/)[0][1]
+                id1 = (line1 =~ /(>.*?);/)[0][1]
+                sequences = sequences + id1 + "\n" + sequence1 + "\n"
             }
         }
     }
@@ -109,17 +111,18 @@ def umi_to_sequence(cluster) {
 }
 
 def umi_to_sequence_centroid(cluster) {
-    cluster.withReader { source ->
-        String line
-        while ( line=source.readLine() ) {
+    String line
+    String sequence
+    String id
+    cluster.withReader {
+        while ( line=it.readLine() ) {
             if (line.startsWith(">")) {
                 sequence = (line =~ /;seq=(.*$)/)[0][1]
                 id = (line =~ /(>.*?);/)[0][1]
+                return id.replace(">", ">centroid_") + "\n" + sequence
             }
         }
     }
-    id = id.replace(">", ">centroid_")
-    return id + "\n" + sequence
 }
 
 
@@ -386,6 +389,9 @@ workflow CRISPRSEQ {
         Channel.value("--sortbysize")
     )
 
+    //VSEARCH_SORT.out.fasta.dump(tag:'vsearch_sort_output')
+    //ch_umi_bysize.cluster.dump(tag:'umi_clusters')
+
     // Get the correspondent fasta sequencences from top cluster sequences
     // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps
     VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta]
@@ -394,9 +400,10 @@ workflow CRISPRSEQ {
         fasta_line = umi_to_sequence_centroid(fasta)
         [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...]
     }
-    .collectFile() { meta, name, fasta ->
+    .collectFile(cache:true,sort:true) { meta, name, fasta ->
         [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta
     }
+    .dump(tag:"collectfile output")
     .map{ new_file ->
         [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta]
     }
@@ -408,6 +415,7 @@ workflow CRISPRSEQ {
         [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
     }
     .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
+    
 
     // Get the correspondent fasta sequencences from UMI clusters
     ch_umi_bysize.cluster // [[id:sample_id, ...], sample]
@@ -416,9 +424,11 @@ workflow CRISPRSEQ {
         fasta_line = umi_to_sequence(cluster)
         [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...]
     }
-    .collectFile() { meta, name, fasta ->
+    .dump(tag:'map allreads output')
+    .collectFile(cache:true,sort:true) { meta, name, fasta ->
         [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta
     }
+    .dump(tag:"collectfile allreads output")
     .map{ new_file ->
         [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta]
     }
@@ -447,11 +457,21 @@ workflow CRISPRSEQ {
         false
     )
 
+    MINIMAP2_ALIGN_UMI_1.out.paf.dump(tag:'minimap_unfiltered')
+
     // Only continue with clusters that have aligned sequences
     MINIMAP2_ALIGN_UMI_1.out.paf
         .filter{ it[1].countLines() > 0 }
         .set{ ch_minimap_1 }
 
+    //ch_clusters_sequence.dump(tag:'ch_umi_clusters')
+    //ch_top_clusters_sequence.dump(tag:'ch_vsearch_top')
+    ch_minimap_1.dump(tag:'minimap')
+    //ch_clusters_sequence
+    //    .join(ch_top_clusters_sequence)
+    //    .join(ch_minimap_1)
+    //    .dump(tag:'clusters_top_alignment')
+    
     //
     // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1
     //

From 6c57e3eb69d8978f1ad62eefd3b1f72dd69555b3 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 12 Jun 2023 09:09:24 +0200
Subject: [PATCH 35/45] fix typo

---
 nextflow_schema.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6a8265bb..a56a38f2 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -5,8 +5,8 @@
     "description": "Pipeline for the analysis of crispr data",
     "type": "object",
     "definitions": {
-        "popeline_steps": {
-            "title": "Popeline steps",
+        "pipeline_steps": {
+            "title": "pipeline steps",
             "type": "object",
             "description": "Alternative pipeline steps to include in the analysis.",
             "default": "",
@@ -401,7 +401,7 @@
             "$ref": "#/definitions/input_output_options"
         },
         {
-            "$ref": "#/definitions/popeline_steps"
+            "$ref": "#/definitions/pipeline_steps"
         },
         {
             "$ref": "#/definitions/screening_parameters"

From 6dfba880712f8b87687cfc0eac8550669132f3a5 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 12 Jun 2023 09:12:25 +0200
Subject: [PATCH 36/45] update schema

---
 nextflow_schema.json | 51 +++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index a56a38f2..27b9911c 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -5,20 +5,6 @@
     "description": "Pipeline for the analysis of crispr data",
     "type": "object",
     "definitions": {
-        "pipeline_steps": {
-            "title": "pipeline steps",
-            "type": "object",
-            "description": "Alternative pipeline steps to include in the analysis.",
-            "default": "",
-            "properties": {
-                "overrepresented": {
-                    "type": "boolean",
-                    "fa_icon": "fas fa-sort-numeric-up-alt",
-                    "description": "Trim overrepresented sequences from reads (cutadapt)"
-                }
-            },
-            "fa_icon": "fas fa-shoe-prints"
-        },
         "input_output_options": {
             "title": "Input/output options",
             "type": "object",
@@ -45,7 +31,8 @@
                 "analysis": {
                     "type": "string",
                     "fa_icon": "fas fa-cog",
-                    "enum": ["screening", "targeted"]
+                    "enum": ["screening", "targeted"],
+                    "description": "Type of analysis to perform"
                 },
                 "email": {
                     "type": "string",
@@ -61,6 +48,20 @@
                 }
             }
         },
+        "targeted_pipeline_steps": {
+            "title": "targeted pipeline steps",
+            "type": "object",
+            "description": "Alternative pipeline steps to include in the targeted analysis.",
+            "default": "",
+            "properties": {
+                "overrepresented": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-sort-numeric-up-alt",
+                    "description": "Trim overrepresented sequences from reads (cutadapt)"
+                }
+            },
+            "fa_icon": "fas fa-shoe-prints"
+        },
         "umi_parameters": {
             "title": "UMI parameters",
             "type": "object",
@@ -162,12 +163,12 @@
                 "min_reads": {
                     "type": "number",
                     "description": "a filter threshold value for sgRNAs, based on their average counts in the control sample",
-                    "default": 30.0
+                    "default": 30
                 },
                 "min_targeted_genes": {
                     "type": "number",
                     "description": "Minimal number of different genes targeted by sgRNAs in a biased segment in order for the corresponding counts to be corrected",
-                    "default": 3.0
+                    "default": 3
                 }
             }
         },
@@ -401,10 +402,7 @@
             "$ref": "#/definitions/input_output_options"
         },
         {
-            "$ref": "#/definitions/pipeline_steps"
-        },
-        {
-            "$ref": "#/definitions/screening_parameters"
+            "$ref": "#/definitions/targeted_pipeline_steps"
         },
         {
             "$ref": "#/definitions/umi_parameters"
@@ -415,6 +413,9 @@
         {
             "$ref": "#/definitions/vsearch_parameters"
         },
+        {
+            "$ref": "#/definitions/screening_parameters"
+        },
         {
             "$ref": "#/definitions/reference_genome_options"
         },
@@ -427,5 +428,11 @@
         {
             "$ref": "#/definitions/generic_options"
         }
-    ]
+    ],
+    "properties": {
+        "schema_ignore_params": {
+            "type": "string",
+            "default": "genomes"
+        }
+    }
 }

From 4ed1882857aa389b187e1547b7e03921dc9e3925 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 12 Jun 2023 09:14:48 +0200
Subject: [PATCH 37/45] add analysis param in test_umis

---
 conf/test_umis.config | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/conf/test_umis.config b/conf/test_umis.config
index 42eab8f8..cfa11b55 100644
--- a/conf/test_umis.config
+++ b/conf/test_umis.config
@@ -20,7 +20,8 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input  = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv'
+    input     = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv'
+    analysis = 'targeted'
 
     // Aligner
     aligner = 'minimap2'

From 0482c066d1314ebbebaede0f5ff6d3c588a2c4d8 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Mon, 12 Jun 2023 14:00:31 +0200
Subject: [PATCH 38/45] add docs for UMIs

---
 README.md                       | 18 +++++++--
 conf/modules.config             | 10 +++++
 docs/output_targeted.md         | 69 +++++++++++++++++++++++++++++----
 docs/usage_targeted.md          |  4 +-
 workflows/crisprseq_targeted.nf | 11 ------
 5 files changed, 87 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index ae00a3f4..8c46e6c6 100644
--- a/README.md
+++ b/README.md
@@ -32,11 +32,19 @@ For crispr targeted :
 2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 3. Adapter trimming ([`Cutadapt`](http://dx.doi.org/10.14806/ej.17.1.200))
 4. Quality filtering ([`Seqtk`](https://github.com/lh3/seqtk))
-5. Read mapping:
+5. UMI clustering (optional):
+  1. Extract UMI sequences (Python script)
+  2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch))
+  3. Obtain the most abundant UMI sequence for each cluster ([`Vsearch`](https://github.com/torognes/vsearch))
+  4. Obtain a consensus for each cluster ([`minimap2`](https://github.com/lh3/minimap2))
+  5. Polish consensus sequence ([`racon`](https://github.com/lbcb-sci/racon))
+  6. Repeat a second rand of consensus + polishing (`minimap2` + `racon`)
+  7. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html))
+6. Read mapping:
    - ([`minimap2`](https://github.com/lh3/minimap2), _default_)
    - ([`bwa`](http://bio-bwa.sourceforge.net/))
    - ([`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml))
-6. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/))
+7. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/))
 
 For crispr screening :
 
@@ -56,7 +64,9 @@ For crispr screening :
 3. Download the pipeline and test it on a minimal dataset with a single command:
 
    ```bash
-   nextflow run nf-core/crisprseq -profile test,YOURPROFILE --outdir <OUTDIR>
+   nextflow run nf-core/crisprseq -profile test_screening,YOURPROFILE --outdir <OUTDIR>
+   # or
+   nextflow run nf-core/crisprseq -profile test_targeted,YOURPROFILE --outdir <OUTDIR>
    ```
 
    Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
@@ -69,7 +79,7 @@ For crispr screening :
 4. Start running your own analysis!
 
    ```bash
-   nextflow run nf-core/crisprseq --input samplesheet.csv --outdir <OUTDIR> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+   nextflow run nf-core/crisprseq --input samplesheet.csv --analysis <targeted/screening> --outdir <OUTDIR> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
    ```
 
 ## Documentation
diff --git a/conf/modules.config b/conf/modules.config
index e28805c2..50a27953 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -162,11 +162,21 @@ process {
     withName: MINIMAP2_ALIGN_UMI_1 {
         ext.args = '-x map-ont'
         ext.prefix = { "${reads.baseName}_cycle1" }
+        publishDir = [
+            path: { "${params.outdir}/minimap2_umi" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
     }
 
     withName: MINIMAP2_ALIGN_UMI_2 {
         ext.args = '-x map-ont'
         ext.prefix = { "${reads.baseName}_cycle2" }
+        publishDir = [
+            path: { "${params.outdir}/minimap2_umi" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
     }
 
     withName: RACON_1 {
diff --git a/docs/output_targeted.md b/docs/output_targeted.md
index 1cb835c1..c84aa887 100644
--- a/docs/output_targeted.md
+++ b/docs/output_targeted.md
@@ -11,14 +11,18 @@ The directories listed below will be created in the results directory after the
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
 - [Preprocessing](#preprocessing)
-  - [Sequences](#sequences) - Input sequence preparation (reference, protospacer, template)
+  - [sequences](#sequences) - Input sequence preparation (reference, protospacer, template)
   - [cat](#cat) - Concatenate sample fastq files if required
-  - [Pear](#pear) - Join double-end reads if required
-  - [FastQC](#fastqc) - Read Quality Control
-  - [Adapters](#adapters) - Find adapters (Overrepresented sequences) in reads
-  - [Cutadapt](#cutadapt) - Trim adapters
-  - [Seqtk](#seqtk) - Mask low-quality bases
-  <!-- -UMI(#umi) -->
+  - [pear](#pear) - Join double-end reads if required
+  - [fastqc](#fastqc) - Read Quality Control
+  - [adapters](#adapters) - Find adapters (Overrepresented sequences) in reads
+  - [cutadapt](#cutadapt) - Trim adapters
+  - [seqtk](#seqtk) - Mask low-quality bases
+- [UMI clustering](#umi-clustering)
+  - [vsearch](#vsearch)
+  - [minimap2 umi](#minimap2-umi)
+  - [racon](#racon)
+  - [medaka](#medaka)
 - [Mapping](#mapping)
   - [minimap2](#minimap2) - Mapping reads to reference
   - [BWA](#bwa) - Mapping reads to reference
@@ -133,7 +137,56 @@ If multiple libraries/runs have been provided for the same sample in the input s
 
 [Seqtk](https://github.com/lh3/seqtk) masks (converts to Ns) bases with quality lower than 20 and removes sequences shorter than 80 bases.
 
-<!-- ### UMI -->
+## UMI clustering
+
+### vsearch
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `vsearch/`
+  - `*_clusters*`: Contains all UMI sequences which clustered together.
+  - `*_clusters*_top.fasta`: Contains the most abundant UMI sequence from the cluster.
+
+</details>
+
+[VSEARCH](https://github.com/torognes/vsearch) is a versatile open-source tool which includes chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. `vsearch/clsuter` can cluster sequences using a single-pass, greedy centroid-based clustering algorithm. `vsearch/sort` can sort fasta entries by decreasing abundance (`--sortbysize`) or sequence length (`--sortbylength`).
+
+### minimap2_umi
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `minimap2_umi/`
+  - `*_sequences_clycle[1,2].paf`: Alignment of the cluster sequences against the top UMi sequence in paf format.
+
+</details>
+
+[Minimap2](https://github.com/lh3/minimap2) is a sequence alignment program that aligns DNA sequences against a reference database.
+
+### racon
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `racon/`
+  - `*_sequences_clycle[1,2]_assembly_consensus.fasta.gz`: Consensus sequence obtained from the cluster multiple sequence alignment.
+
+</details>
+
+[Racon](https://github.com/lbcb-sci/racon) is an ultrafast consensus module for raw de novo genome assembly of long uncorrected reads.
+
+### medaka
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `medaka/`
+  - `*_medakaConsensus.fasta`: Final consensus sequence of each UMI cluster. Obtained after two rounds of minimap2 + racon.
+
+</details>
+
+[Medaka](https://nanoporetech.github.io/medaka/index.html) is a tool to create consensus sequences and variant calls from nanopore sequencing data.
 
 ## Mapping
 
diff --git a/docs/usage_targeted.md b/docs/usage_targeted.md
index f6af7ff1..8f9d759d 100644
--- a/docs/usage_targeted.md
+++ b/docs/usage_targeted.md
@@ -53,10 +53,10 @@ An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv)
 
 ## Running the pipeline
 
-The typical command for running the pipeline is as follows:
+The typical command for running the pipeline for targeted CIRSPR analysis is as follows:
 
 ```bash
-nextflow run nf-core/crisprseq --input samplesheet.csv --outdir <OUTDIR> -profile docker
+nextflow run nf-core/crisprseq --input samplesheet.csv --analysis targeted --outdir <OUTDIR> -profile docker
 ```
 
 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf
index a81797b7..ba068d8b 100644
--- a/workflows/crisprseq_targeted.nf
+++ b/workflows/crisprseq_targeted.nf
@@ -389,9 +389,6 @@ workflow CRISPRSEQ_TARGETED {
         Channel.value("--sortbysize")
     )
 
-    //VSEARCH_SORT.out.fasta.dump(tag:'vsearch_sort_output')
-    //ch_umi_bysize.cluster.dump(tag:'umi_clusters')
-
     // Get the correspondent fasta sequencences from top cluster sequences
     // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps
     VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta]
@@ -464,14 +461,6 @@ workflow CRISPRSEQ_TARGETED {
         .filter{ it[1].countLines() > 0 }
         .set{ ch_minimap_1 }
 
-    //ch_clusters_sequence.dump(tag:'ch_umi_clusters')
-    //ch_top_clusters_sequence.dump(tag:'ch_vsearch_top')
-    ch_minimap_1.dump(tag:'minimap')
-    //ch_clusters_sequence
-    //    .join(ch_top_clusters_sequence)
-    //    .join(ch_minimap_1)
-    //    .dump(tag:'clusters_top_alignment')
-
     //
     // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1
     //

From 4ab6d0b6d7781abd51cc60fa6935c62774b07814 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= <mirp.julia@gmail.com>
Date: Tue, 27 Jun 2023 15:38:58 +0200
Subject: [PATCH 39/45] Apply suggestions from code review

---
 CHANGELOG.md                    | 2 +-
 workflows/crisprseq_targeted.nf | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4bf11ddc..2e8564c1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Add UMI clustering to crisprseq-edition ([#24](https://github.com/nf-core/crisprseq/pull/24))
+- Add UMI clustering to crisprseq-targeted ([#24](https://github.com/nf-core/crisprseq/pull/24))
 
 ### Fixed
 
diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf
index ba068d8b..5265168e 100644
--- a/workflows/crisprseq_targeted.nf
+++ b/workflows/crisprseq_targeted.nf
@@ -678,9 +678,9 @@ workflow CRISPRSEQ_TARGETED {
     //
     // MODULE: Parse cigar to find edits
     //
-    //CIGAR_PARSER (
-    //    ch_to_parse_cigar
-    //)
+    CIGAR_PARSER (
+        ch_to_parse_cigar
+    )
 
 
     //

From ae81ac0e873a9c146b2251a9fd48eeb58d5e7674 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Tue, 27 Jun 2023 15:53:29 +0200
Subject: [PATCH 40/45] add --umi_clustering parameter to opt-in the umi
 clustering steps

---
 nextflow.config                 |   1 +
 nextflow_schema.json            |  20 +-
 workflows/crisprseq_targeted.nf | 387 ++++++++++++++++----------------
 3 files changed, 209 insertions(+), 199 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 77899b0e..952b570d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -23,6 +23,7 @@ params {
 
     // Pipeline steps
     overrepresented            = false
+    umi_clustering             = false
 
     // UMI parameters
     umi_bin_size               = 1
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 1cc44ce2..31256db7 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -58,6 +58,11 @@
                     "type": "boolean",
                     "fa_icon": "fas fa-sort-numeric-up-alt",
                     "description": "Trim overrepresented sequences from reads (cutadapt)"
+                },
+                "umi_clustering": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-layer-group",
+                    "description": "If the sample contains umi-molecular identifyers (UMIs), run the UMI extraction, clustering and consensus steps."
                 }
             },
             "fa_icon": "fas fa-shoe-prints"
@@ -393,6 +398,13 @@
                     "description": "Show all params when using `--help`",
                     "hidden": true,
                     "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters."
+                },
+                "schema_ignore_params": {
+                    "type": "string",
+                    "default": "genomes",
+                    "description": "Ignore JSON schema validation of the following params",
+                    "fa_icon": "fas fa-ban",
+                    "hidden": true
                 }
             }
         }
@@ -428,11 +440,5 @@
         {
             "$ref": "#/definitions/generic_options"
         }
-    ],
-    "properties": {
-        "schema_ignore_params": {
-            "type": "string",
-            "default": "genomes"
-        }
-    }
+    ]
 }
diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf
index 5265168e..b7f0e025 100644
--- a/workflows/crisprseq_targeted.nf
+++ b/workflows/crisprseq_targeted.nf
@@ -324,224 +324,227 @@ workflow CRISPRSEQ_TARGETED {
     }
 
 
-    //
-    // MODULE: Extract UMI sequences
-    //
-    EXTRACT_UMIS (
-        SEQTK_SEQ_MASK.out.fastx
-    )
+    if (params.umi_clustering) {
+        //
+        // MODULE: Extract UMI sequences
+        //
+        EXTRACT_UMIS (
+            SEQTK_SEQ_MASK.out.fastx
+        )
 
 
-    //
-    // MODULE: Cluster UMIs
-    //
-    VSEARCH_CLUSTER (
-        EXTRACT_UMIS.out.fasta
-    )
+        //
+        // MODULE: Cluster UMIs
+        //
+        VSEARCH_CLUSTER (
+            EXTRACT_UMIS.out.fasta
+        )
 
-    //  Obtain a file with UBS (UBI bin size) and UMI ID
-    VSEARCH_CLUSTER.out.clusters
-    .transpose()
-    .collectFile( storeDir:params.outdir ) {
-        it ->
-            [ "${it[0].id}_ubs.txt", "${it[1].countFasta()}\t${it[1].baseName}\n" ]
-    }
+        //  Obtain a file with UBS (UBI bin size) and UMI ID
+        VSEARCH_CLUSTER.out.clusters
+        .transpose()
+        .collectFile( storeDir:params.outdir ) {
+            it ->
+                [ "${it[0].id}_ubs.txt", "${it[1].countFasta()}\t${it[1].baseName}\n" ]
+        }
 
-    // Branch the clusters into the ones containing only one sequence and the ones containing more than one sequences
-    VSEARCH_CLUSTER.out.clusters
-    .transpose()
-    .branch{
-        meta, cluster ->
-            single: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() == 1
-                return [meta, cluster]
-            cluster: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() > 1
-                return [meta, cluster]
-    }
-    .set{ ch_umi_bysize }
-
-    // Get the correspondent fasta sequencences from single clusters
-    ch_umi_bysize.single
-    .tap{ meta_channel }
-    .map{ meta, cluster ->
-        fasta_line = umi_to_sequence(cluster)
-        [meta, cluster.baseName, fasta_line]
-    }
-    .collectFile() { meta, name, fasta ->
-        [ "${name}_consensus.fasta", fasta ]
-    }
-    .map{ new_file ->
-        [new_file.baseName, new_file]
-    }
-    .join(meta_channel
-        .map { meta, original_file ->
-            ["${original_file.baseName}_consensus", meta]
-        })
-    .map{ file_name, new_file, meta ->
-        [meta, new_file]
-    }
-    .set{ ch_single_clusters_consensus }
+        // Branch the clusters into the ones containing only one sequence and the ones containing more than one sequences
+        VSEARCH_CLUSTER.out.clusters
+        .transpose()
+        .branch{
+            meta, cluster ->
+                single: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() == 1
+                    return [meta, cluster]
+                cluster: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() > 1
+                    return [meta, cluster]
+        }
+        .set{ ch_umi_bysize }
+
+        // Get the correspondent fasta sequencences from single clusters
+        ch_umi_bysize.single
+        .tap{ meta_channel }
+        .map{ meta, cluster ->
+            fasta_line = umi_to_sequence(cluster)
+            [meta, cluster.baseName, fasta_line]
+        }
+        .collectFile() { meta, name, fasta ->
+            [ "${name}_consensus.fasta", fasta ]
+        }
+        .map{ new_file ->
+            [new_file.baseName, new_file]
+        }
+        .join(meta_channel
+            .map { meta, original_file ->
+                ["${original_file.baseName}_consensus", meta]
+            })
+        .map{ file_name, new_file, meta ->
+            [meta, new_file]
+        }
+        .set{ ch_single_clusters_consensus }
 
-    //
-    // MODULE: Obtain most abundant UMI in cluster
-    //
-    VSEARCH_SORT(
-        ch_umi_bysize.cluster,
-        Channel.value("--sortbysize")
-    )
+        //
+        // MODULE: Obtain most abundant UMI in cluster
+        //
+        VSEARCH_SORT(
+            ch_umi_bysize.cluster,
+            Channel.value("--sortbysize")
+        )
 
-    // Get the correspondent fasta sequencences from top cluster sequences
-    // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps
-    VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta]
-    .tap{ meta_channel_2 } // [[id:sample_id, ...], sample_top.fasta]
-    .map{ meta, fasta ->
-        fasta_line = umi_to_sequence_centroid(fasta)
-        [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...]
-    }
-    .collectFile(cache:true,sort:true) { meta, name, fasta ->
-        [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta
-    }
-    .dump(tag:"collectfile output")
-    .map{ new_file ->
-        [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta]
-    }
-    .join(meta_channel_2
-        .map { meta, original_file ->
-            ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]]
-        }) // [sample, sample_top.fasta, [id:sample_id, ...]]
-    .map{ file_name, new_file, meta ->
-        [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
-    }
-    .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
+        // Get the correspondent fasta sequencences from top cluster sequences
+        // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps
+        VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta]
+        .tap{ meta_channel_2 } // [[id:sample_id, ...], sample_top.fasta]
+        .map{ meta, fasta ->
+            fasta_line = umi_to_sequence_centroid(fasta)
+            [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...]
+        }
+        .collectFile(cache:true,sort:true) { meta, name, fasta ->
+            [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta
+        }
+        .dump(tag:"collectfile output")
+        .map{ new_file ->
+            [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta]
+        }
+        .join(meta_channel_2
+            .map { meta, original_file ->
+                ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]]
+            }) // [sample, sample_top.fasta, [id:sample_id, ...]]
+        .map{ file_name, new_file, meta ->
+            [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
+        }
+        .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta]
 
 
-    // Get the correspondent fasta sequencences from UMI clusters
-    ch_umi_bysize.cluster // [[id:sample_id, ...], sample]
-    .tap{ meta_channel_3 } // [[id:sample_id, ...], sample]
-    .map{ meta, cluster ->
-        fasta_line = umi_to_sequence(cluster)
-        [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...]
-    }
-    .dump(tag:'map allreads output')
-    .collectFile(cache:true,sort:true) { meta, name, fasta ->
-        [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta
-    }
-    .dump(tag:"collectfile allreads output")
-    .map{ new_file ->
-        [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta]
-    }
-    .join(meta_channel_3
-        .map { meta, original_file ->
-            ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]]
-        }) // [sample, sample_sequences.fasta, [id:sample_id, ...]]
-    .map{ file_name, new_file, meta ->
-        [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_sequences.fasta]
-    }
-    .set{ ch_clusters_sequence }
+        // Get the correspondent fasta sequencences from UMI clusters
+        ch_umi_bysize.cluster // [[id:sample_id, ...], sample]
+        .tap{ meta_channel_3 } // [[id:sample_id, ...], sample]
+        .map{ meta, cluster ->
+            fasta_line = umi_to_sequence(cluster)
+            [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...]
+        }
+        .dump(tag:'map allreads output')
+        .collectFile(cache:true,sort:true) { meta, name, fasta ->
+            [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta
+        }
+        .dump(tag:"collectfile allreads output")
+        .map{ new_file ->
+            [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta]
+        }
+        .join(meta_channel_3
+            .map { meta, original_file ->
+                ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]]
+            }) // [sample, sample_sequences.fasta, [id:sample_id, ...]]
+        .map{ file_name, new_file, meta ->
+            [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_sequences.fasta]
+        }
+        .set{ ch_clusters_sequence }
 
 
-    // Cluster consensus & polishing
-    // Two cycles of minimap2 + racon
+        // Cluster consensus & polishing
+        // Two cycles of minimap2 + racon
 
-    //
-    // MODULE: Mapping with minimap2 - cycle 1
-    //
-    // Map each cluster against the top read (most abundant UMI) in the cluster
-    MINIMAP2_ALIGN_UMI_1 (
-        ch_clusters_sequence
-            .join(ch_top_clusters_sequence),
-        false, //output in paf format
-        false,
-        false
-    )
+        //
+        // MODULE: Mapping with minimap2 - cycle 1
+        //
+        // Map each cluster against the top read (most abundant UMI) in the cluster
+        MINIMAP2_ALIGN_UMI_1 (
+            ch_clusters_sequence
+                .join(ch_top_clusters_sequence),
+            false, //output in paf format
+            false,
+            false
+        )
 
-    MINIMAP2_ALIGN_UMI_1.out.paf.dump(tag:'minimap_unfiltered')
+        MINIMAP2_ALIGN_UMI_1.out.paf.dump(tag:'minimap_unfiltered')
 
-    // Only continue with clusters that have aligned sequences
-    MINIMAP2_ALIGN_UMI_1.out.paf
-        .filter{ it[1].countLines() > 0 }
-        .set{ ch_minimap_1 }
+        // Only continue with clusters that have aligned sequences
+        MINIMAP2_ALIGN_UMI_1.out.paf
+            .filter{ it[1].countLines() > 0 }
+            .set{ ch_minimap_1 }
 
-    //
-    // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1
-    //
-    RACON_1 (
-        ch_clusters_sequence
-            .join(ch_top_clusters_sequence)
-            .join(ch_minimap_1)
-    )
+        //
+        // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1
+        //
+        RACON_1 (
+            ch_clusters_sequence
+                .join(ch_top_clusters_sequence)
+                .join(ch_minimap_1)
+        )
 
-    //
-    // MODULE: Mapping with minimap2 - cycle 2
-    //
-    // Map each cluster against the top read (most abundant UMI) in the cluster
-    MINIMAP2_ALIGN_UMI_2 (
-        ch_clusters_sequence
-            .join(RACON_1.out.improved_assembly),
-        false, //output in paf format
-        false,
-        false
-    )
+        //
+        // MODULE: Mapping with minimap2 - cycle 2
+        //
+        // Map each cluster against the top read (most abundant UMI) in the cluster
+        MINIMAP2_ALIGN_UMI_2 (
+            ch_clusters_sequence
+                .join(RACON_1.out.improved_assembly),
+            false, //output in paf format
+            false,
+            false
+        )
 
-    // Only continue with clusters that have aligned sequences
-    MINIMAP2_ALIGN_UMI_2.out.paf
-        .filter{ it[1].countLines() > 0 }
-        .set{ ch_minimap_2 }
+        // Only continue with clusters that have aligned sequences
+        MINIMAP2_ALIGN_UMI_2.out.paf
+            .filter{ it[1].countLines() > 0 }
+            .set{ ch_minimap_2 }
 
-    //
-    // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2
-    //
-    RACON_2 (
-        ch_clusters_sequence
-            .join(RACON_1.out.improved_assembly)
-            .join(ch_minimap_2)
-    )
+        //
+        // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2
+        //
+        RACON_2 (
+            ch_clusters_sequence
+                .join(RACON_1.out.improved_assembly)
+                .join(ch_minimap_2)
+        )
 
 
-    //
-    // MODULE: Obtain a consensus sequence
-    //
-    MEDAKA (
-        ch_clusters_sequence
-            .join(RACON_2.out.improved_assembly)
-    )
+        //
+        // MODULE: Obtain a consensus sequence
+        //
+        MEDAKA (
+            ch_clusters_sequence
+                .join(RACON_2.out.improved_assembly)
+        )
 
-    // Collect all consensus UMI sequences into one single file per sample
-    MEDAKA.out.assembly
-    .tap{ meta_channel_4 }
-    .map{ meta, file ->
-        file_content = file.getText()
-        [meta, file_content] // [[id:sample_id, ...], consensus_content]
-    }
-    .collectFile() { meta, file ->
-        [ "${meta.id}_consensus.fasta", file ]
-    }
-    .map{ new_file ->
-        [new_file.baseName, new_file]
-    }
-    .join(meta_channel_4
-        .map{ meta, consensus ->
-            ["${meta.id}_consensus", meta]
+        // Collect all consensus UMI sequences into one single file per sample
+        MEDAKA.out.assembly
+        .tap{ meta_channel_4 }
+        .map{ meta, file ->
+            file_content = file.getText()
+            [meta, file_content] // [[id:sample_id, ...], consensus_content]
         }
-    )
-    .map{ name, file, meta ->
-        [meta - meta.subMap('cluster_id'), file]
-    }
-    .set{ ch_umi_consensus }
+        .collectFile() { meta, file ->
+            [ "${meta.id}_consensus.fasta", file ]
+        }
+        .map{ new_file ->
+            [new_file.baseName, new_file]
+        }
+        .join(meta_channel_4
+            .map{ meta, consensus ->
+                ["${meta.id}_consensus", meta]
+            }
+        )
+        .map{ name, file, meta ->
+            [meta - meta.subMap('cluster_id'), file]
+        }
+        .set{ ch_umi_consensus }
 
 
-    //
-    // MODULE: Convert fasta to fastq
-    //
-    SEQTK_SEQ_FATOFQ (
-        ch_umi_consensus
-    )
+        //
+        // MODULE: Convert fasta to fastq
+        //
+        SEQTK_SEQ_FATOFQ (
+            ch_umi_consensus
+        )
+    }
 
+    ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ.out.fastx
 
     //
     // MODULE: Summary of clustered reads
     //
     CLUSTERING_SUMMARY (
-        SEQTK_SEQ_FATOFQ.out.fastx
+        ch_preprocess_reads
             .join(MERGING_SUMMARY.out.summary)
     )
 
@@ -551,7 +554,7 @@ workflow CRISPRSEQ_TARGETED {
     //
     if (params.aligner == "minimap2") {
         MINIMAP2_ALIGN_ORIGINAL (
-            SEQTK_SEQ_FATOFQ.out.fastx
+            ch_preprocess_reads
                 .join(ORIENT_REFERENCE.out.reference),
             true,
             false,
@@ -570,7 +573,7 @@ workflow CRISPRSEQ_TARGETED {
         )
         ch_versions = ch_versions.mix(BWA_INDEX.out.versions)
         BWA_MEM (
-            SEQTK_SEQ_FATOFQ.out.fastx,
+            ch_preprocess_reads,
             BWA_INDEX.out.index,
             true
         )
@@ -587,7 +590,7 @@ workflow CRISPRSEQ_TARGETED {
         )
         ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions)
         BOWTIE2_ALIGN (
-            SEQTK_SEQ_FATOFQ.out.fastx,
+            ch_preprocess_reads,
             BOWTIE2_BUILD.out.index,
             false,
             true

From 602a52b50c7ea49f471e486598b96619fb993249 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Tue, 27 Jun 2023 15:59:35 +0200
Subject: [PATCH 41/45] add pipeline optional step parameters to usage
 documentation

---
 docs/usage/targeted.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/usage/targeted.md b/docs/usage/targeted.md
index 667d03df..9f58cc45 100644
--- a/docs/usage/targeted.md
+++ b/docs/usage/targeted.md
@@ -55,6 +55,26 @@ chr6,chr6-61942198-61942498_R1.fastq.gz,,CAA...GGA,TTTTATGATATTTATCTTTT,TTC...CA
 
 An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv) has been provided with the pipeline.
 
+## Optional pipeline steps
+
+### Trimming of overrepresented sequences
+
+To trim the overrepresented sequences found with FastQC from the reads, use the parameter `--overrepresented`.
+Such sequences are not trimmed by default.
+When using the `--overrepresented` parameter, cutadapt is used to trim overrepresented sequences from the input FASTQ files.
+
+### UMI clustering
+
+If the provided samples were sequenced using umi-molecular identifyers (UMIs), use the parameter `--umi_clustering` in order to run the clustering steps.
+
+1. Extract UMI sequences (Python script)
+2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch))
+3. Obtain the most abundant UMI sequence for each cluster ([`Vsearch`](https://github.com/torognes/vsearch))
+4. Obtain a consensus for each cluster ([`minimap2`](https://github.com/lh3/minimap2))
+5. Polish consensus sequence ([`racon`](https://github.com/lbcb-sci/racon))
+6. Repeat a second rand of consensus + polishing (`minimap2` + `racon`)
+7. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html))
+
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:

From 09201ed49df88c4c025fa799eceb73a8e68e80a4 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Tue, 27 Jun 2023 15:59:51 +0200
Subject: [PATCH 42/45] add pipeline optional step parameters to usage
 documentation

---
 docs/usage/targeted.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/usage/targeted.md b/docs/usage/targeted.md
index 9f58cc45..f319aa56 100644
--- a/docs/usage/targeted.md
+++ b/docs/usage/targeted.md
@@ -61,11 +61,11 @@ An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv)
 
 To trim the overrepresented sequences found with FastQC from the reads, use the parameter `--overrepresented`.
 Such sequences are not trimmed by default.
-When using the `--overrepresented` parameter, cutadapt is used to trim overrepresented sequences from the input FASTQ files.
+When using the `--overrepresented` parameter, Cutadapt is used to trim overrepresented sequences from the input FASTQ files.
 
 ### UMI clustering
 
-If the provided samples were sequenced using umi-molecular identifyers (UMIs), use the parameter `--umi_clustering` in order to run the clustering steps.
+If the provided samples were sequenced using umi-molecular identifiers (UMIs), use the parameter `--umi_clustering` in order to run the clustering steps.
 
 1. Extract UMI sequences (Python script)
 2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch))

From bf99c289b7f769341f2523ae672177bacb3da20f Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Tue, 27 Jun 2023 16:04:24 +0200
Subject: [PATCH 43/45] fix alias SEQTK_SEQ_MASK

---
 conf/modules.config             | 2 +-
 workflows/crisprseq_targeted.nf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 50a27953..bdfbdd64 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -126,7 +126,7 @@ process {
         ]
     }
 
-    withName: SEQTK_SEQ {
+    withName: SEQTK_SEQ_MASK {
         ext.args = '-q 20 -L 80 -n N'
         publishDir = [
             path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf
index b7f0e025..8aace856 100644
--- a/workflows/crisprseq_targeted.nf
+++ b/workflows/crisprseq_targeted.nf
@@ -538,7 +538,7 @@ workflow CRISPRSEQ_TARGETED {
         )
     }
 
-    ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ.out.fastx
+    ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ_MASK.out.fastx
 
     //
     // MODULE: Summary of clustered reads

From fb1c14fcb8aacebfd4ced525241954f0ea5c3b13 Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Tue, 27 Jun 2023 17:21:47 +0200
Subject: [PATCH 44/45] fix: MERGING_SUMMARY runs when --overrepresented is
 false

---
 templates/merging_summary.py    | 26 ++++++++++++++++----------
 workflows/crisprseq_targeted.nf | 32 ++++++++++++++++++++++----------
 2 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/templates/merging_summary.py b/templates/merging_summary.py
index 0eb1239a..804a5be3 100755
--- a/templates/merging_summary.py
+++ b/templates/merging_summary.py
@@ -6,22 +6,28 @@
 
 with gzip.open("${raw_reads[0]}", "rt") as handle:
     raw_reads_count = len(list(SeqIO.parse(handle, "fastq")))
-if "$assembled_reads" == "null":
+
+if "$assembled_reads" == "null_a":
     assembled_reads_count = 0
 else:
     with gzip.open("$assembled_reads", "rt") as handle:
         assembled_reads_count = len(list(SeqIO.parse(handle, "fastq")))  # Merged reads R1+R2
+
 with gzip.open("$trimmed_reads", "rt") as handle:
     trimmed_reads_count = len(list(SeqIO.parse(handle, "fastq")))  # Filtered reads
 
-with open("$trimmed_adapters", "r") as handle:
-    for line in handle:
-        if line.startswith("Reads with adapters"):
-            for field in "".join(line.split(",")).split():
-                if field.isdigit():
-                    adapters_count = field  # reads with adapters
-                if "%" in field:
-                    adapters_percentage = field  # percentage of reads with adapters: ex. "(100.0%)"
+if "$trimmed_adapters" == "null_t":
+    adapters_count = 0
+    adapters_percentage = "(0.0%)"
+else:
+    with open("$trimmed_adapters", "r") as handle:
+        for line in handle:
+            if line.startswith("Reads with adapters"):
+                for field in "".join(line.split(",")).split():
+                    if field.isdigit():
+                        adapters_count = field  # reads with adapters
+                    if "%" in field:
+                        adapters_percentage = field  # percentage of reads with adapters: ex. "(100.0%)"
 
 if "$task.ext.prefix" != "null":
     prefix = "$task.ext.prefix"
@@ -35,7 +41,7 @@
         f"merged-reads, {assembled_reads_count} ({round(assembled_reads_count * 100 / raw_reads_count,1)}%)\\n"
     )
     output_file.write(f"reads-with-adapters, {adapters_count} {adapters_percentage}\\n")
-    if "$assembled_reads" == "null":
+    if "$assembled_reads" == "null_a":
         output_file.write(
             f"quality-filtered-reads, {trimmed_reads_count} ({round(trimmed_reads_count * 100 / raw_reads_count,1)}%)\\n"
         )
diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf
index 8aace856..9484ee28 100644
--- a/workflows/crisprseq_targeted.nf
+++ b/workflows/crisprseq_targeted.nf
@@ -303,24 +303,36 @@ workflow CRISPRSEQ_TARGETED {
     )
     ch_versions = ch_versions.mix(SEQTK_SEQ_MASK.out.versions)
 
-
-    //
-    // MODULE: Summary of merged reads
-    //
-    MERGING_SUMMARY {
+    if (params.overrepresented) {
+        ch_cat_fastq.paired
+            .mix(ch_cat_fastq.single)
+            .join(PEAR.out.assembled, remainder: true)
+            .join(SEQTK_SEQ_MASK.out.fastx)
+            .join(CUTADAPT.out.log)
+            .set { ch_merging_summary_data }
+    } else {
         ch_cat_fastq.paired
             .mix(ch_cat_fastq.single)
             .join(PEAR.out.assembled, remainder: true)
             .join(SEQTK_SEQ_MASK.out.fastx)
-            .join(( params.overrepresented ? CUTADAPT.out.log : Channel.value("null") ))
+            .combine(Channel.value("null"))
             .map { meta, reads, assembled, masked, trimmed ->
                 if (assembled == null) {
-                    dummy = file('null')
-                    return [ meta, reads, dummy, masked, trimmed ]
-                } else {
-                    return [ meta, reads, assembled, masked, trimmed ]
+                    assembled = file('null_a')
+                }
+                if (trimmed == "null") {
+                    trimmed = file('null_t')
                 }
+                return [ meta, reads, assembled, masked, trimmed ]
             }
+            .set { ch_merging_summary_data }
+    }
+
+    //
+    // MODULE: Summary of merged reads
+    //
+    MERGING_SUMMARY {
+        ch_merging_summary_data
     }
 
 

From 7f9bfda971dfe8eeaf16ee273d8b49dcdae16c7f Mon Sep 17 00:00:00 2001
From: mirpedrol <mirp.julia@gmail.com>
Date: Wed, 28 Jun 2023 08:51:53 +0200
Subject: [PATCH 45/45] add param umi_clustering to test_umis

---
 conf/test_umis.config | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/conf/test_umis.config b/conf/test_umis.config
index cfa11b55..6efc80d8 100644
--- a/conf/test_umis.config
+++ b/conf/test_umis.config
@@ -20,8 +20,9 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input     = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv'
-    analysis = 'targeted'
+    input          = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv'
+    analysis       = 'targeted'
+    umi_clustering = true
 
     // Aligner
     aligner = 'minimap2'