From 760fa55487852d887a4140ca9c52b45bb8c36d54 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Wed, 15 Mar 2023 09:54:26 +0100 Subject: [PATCH 01/45] add extract umi step --- bin/extract_umis.py | 8 ++++++-- workflows/crisprseq.nf | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bin/extract_umis.py b/bin/extract_umis.py index 78802c9f..e4792d0f 100755 --- a/bin/extract_umis.py +++ b/bin/extract_umis.py @@ -1,6 +1,10 @@ #!/usr/bin/env python3 -# FROM: pipeline-umi-amplicon distributed by ONT https://github.com/nanoporetech/pipeline-umi-amplicon -# https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py +# +# This code is obtained from Oxford Nanopore Technologies pipeline-umi-amplicon +# Distributed under the Mozilla Public License, v. 2.0 +# +# Original source code: https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py +# import argparse import logging diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 11174d93..fe06f95f 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -269,8 +269,6 @@ workflow CRISPRSEQ { } - /* - The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: // // MODULE: Extract UMI sequences @@ -279,6 +277,10 @@ workflow CRISPRSEQ { SEQTK_SEQ.out.fastx ) + /* + The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: + + Modules to implement: vsearch From 21cda6b85a715ce492d6b6b9702ac1d1a38f9252 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Wed, 15 Mar 2023 10:33:06 +0100 Subject: [PATCH 02/45] add test_umis.config --- conf/test_umis.config | 27 +++++++++++++++++++++++++++ nextflow.config | 1 + workflows/crisprseq.nf | 2 ++ 3 files changed, 30 insertions(+) create mode 100644 conf/test_umis.config diff --git a/conf/test_umis.config b/conf/test_umis.config new file mode 100644 index 00000000..42eab8f8 --- /dev/null +++ b/conf/test_umis.config @@ -0,0 +1,27 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests (with UMIs option) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/crisprseq -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile UMIs' + config_profile_description = 'Minimal test dataset to check pipeline function with UMIs option' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv' + + // Aligner + aligner = 'minimap2' +} diff --git a/nextflow.config b/nextflow.config index da037ee2..e415ebe9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -144,6 +144,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_umis { includeConfig 'conf/test_umis.config' } } diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index fe06f95f..439d60d5 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -277,6 +277,8 @@ workflow CRISPRSEQ { SEQTK_SEQ.out.fastx ) + EXTRACT_UMIS.out.fasta.view() + /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: From 373b8095499dcd09a6b40b965d7397843cb881da Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 17 Mar 2023 11:42:18 +0100 Subject: [PATCH 03/45] fix extract_umis --- modules/local/extract_umis.nf | 6 +++--- workflows/crisprseq.nf | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/local/extract_umis.nf b/modules/local/extract_umis.nf index 4f9be2a3..d6c8fb44 100644 --- a/modules/local/extract_umis.nf +++ b/modules/local/extract_umis.nf @@ -11,9 +11,9 @@ process EXTRACT_UMIS { tuple val(meta), path(reads) output: - tuple val(meta), path(fasta) , emit: fasta - tuple val(meta), path(tsv) , emit: tsv - path "versions.yml" , emit: versions + tuple val(meta), path("*.fasta") , emit: fasta + tuple val(meta), path("*.tsv") , emit: tsv + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 439d60d5..48ceab53 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -269,7 +269,6 @@ workflow CRISPRSEQ { } - // // MODULE: Extract UMI sequences // From ab49b680eaafef5b36918e26e3d663c02a743d45 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 17 Mar 2023 13:08:59 +0100 Subject: [PATCH 04/45] install vsearch --- modules.json | 5 ++ modules/nf-core/vsearch/cluster/main.nf | 73 ++++++++++++++++++++++++ modules/nf-core/vsearch/cluster/meta.yml | 68 ++++++++++++++++++++++ workflows/crisprseq.nf | 1 + 4 files changed, 147 insertions(+) create mode 100644 modules/nf-core/vsearch/cluster/main.nf create mode 100644 modules/nf-core/vsearch/cluster/meta.yml diff --git a/modules.json b/modules.json index 175f4522..350f06ae 100644 --- a/modules.json +++ b/modules.json @@ -71,6 +71,11 @@ "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] + }, + "vsearch/cluster": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf new file mode 100644 index 00000000..7bcda8ac --- /dev/null +++ b/modules/nf-core/vsearch/cluster/main.nf @@ -0,0 +1,73 @@ +process VSEARCH_CLUSTER { + tag "$meta.id" + label 'process_low' + + conda "bioconda::vsearch=2.21.1 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0': + 'quay.io/biocontainers/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.aln.gz') , optional: true, emit: aln + tuple val(meta), path('*.biom.gz') , optional: true, emit: biom + tuple val(meta), path('*.mothur.tsv.gz') , optional: true, emit: mothur + tuple val(meta), path('*.otu.tsv.gz') , optional: true, emit: otu + tuple val(meta), path('*.bam') , optional: true, emit: bam + tuple val(meta), path('*.out.tsv.gz') , optional: true, emit: out + tuple val(meta), path('*.blast.tsv.gz') , optional: true, emit: blast + tuple val(meta), path('*.uc.tsv.gz') , optional: true, emit: uc + tuple val(meta), path('*.centroids.fasta.gz') , optional: true, emit: centroids + tuple val(meta), path('*.clusters.fasta.gz') , optional: true, emit: clusters + tuple val(meta), path('*.profile.txt.gz') , optional: true, emit: profile + tuple val(meta), path('*.msa.fasta.gz') , optional: true, emit: msa + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) { + error "Unknown clustering option provided (${args2})" + } + def out_ext = args3.contains("--alnout") ? "aln" : + args3.contains("--biomout") ? "biom" : + args3.contains("--blast6out") ? "blast.tsv" : + args3.contains("--centroids") ? "centroids.fasta" : + args3.contains("--clusters") ? "clusters.fasta" : + args3.contains("--mothur_shared_out") ? "mothur.tsv" : + args3.contains("--msaout") ? "msa.fasta" : + args3.contains("--otutabout") ? "otu.tsv" : + args3.contains("--profile") ? "profile.txt" : + args3.contains("--samout") ? "sam" : + args3.contains("--uc") ? "uc.tsv" : + args3.contains("--userout") ? "out.tsv" : + "" + if (out_ext == "") { error "Unknown output file format provided (${args3})" } + """ + vsearch \\ + $args2 $fasta \\ + $args3 ${prefix}.${out_ext} \\ + --threads $task.cpus \\ + $args + + if [[ $args3 != "--samout" ]] + then + gzip -n ${prefix}.${out_ext} + else + samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/vsearch/cluster/meta.yml b/modules/nf-core/vsearch/cluster/meta.yml new file mode 100644 index 00000000..5cc620cb --- /dev/null +++ b/modules/nf-core/vsearch/cluster/meta.yml @@ -0,0 +1,68 @@ +name: "vsearch_cluster" +description: Cluster sequences using a single-pass, greedy centroid-based clustering algorithm. +keywords: + - vsearch + - clustering +tools: + - vsearch: + description: VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative) + homepage: https://github.com/torognes/vsearch + documentation: https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf + tool_dev_url: https://github.com/torognes/vsearch + doi: 10.7717/peerj.2584 + licence: ["GPL v3-or-later OR BSD-2-clause"] + +input: + - meta: + type: map + description: Groovy Map containing sample information e.g. [ id:'test' ] + - fasta: + type: file + description: Sequences to cluster in FASTA format + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - aln: + type: file + description: Results in pairwise alignment format + pattern: "*.aln.gz" + - biom: + type: file + description: Results in an OTU table in the biom version 1.0 file format + pattern: "*.biom.gz" + - mothur: + type: file + description: Results in an OTU table in the mothur ’shared’ tab-separated plain text file format + pattern: "*.mothur.tsv.gz" + - otu: + type: file + description: Results in an OTU table in the classic tab-separated plain text format + pattern: "*.otu.tsv.gz" + - bam: + type: file + description: Results written in bam format + pattern: "*.bam" + - out: + type: file + description: Results in tab-separated output, columns defined by user + pattern: "*.out.tsv.gz" + - blast: + type: file + description: Tab delimited results in blast-like tabular format + pattern: "*.blast.tsv.gz" + - uc: + type: file + description: Tab delimited results in a uclust-like format with 10 columns + pattern: "*.uc.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mirpedrol" diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 48ceab53..d7d0c55e 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -68,6 +68,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-co include { PEAR } from '../modules/nf-core/pear/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' include { SEQTK_SEQ } from '../modules/nf-core/seqtk/seq/main' +include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' include { BOWTIE2_ALIGN } from '../modules/nf-core/bowtie2/align/main' include { BOWTIE2_BUILD } from '../modules/nf-core/bowtie2/build/main' include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' From 1eb72b8888293b4fa6dac9b6ae2a70f6c7a21e6b Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 20 Mar 2023 09:19:26 +0100 Subject: [PATCH 05/45] add vsearch/cluster step --- conf/modules.config | 6 ++ modules.json | 3 +- modules/nf-core/vsearch/cluster/main.nf | 29 +++----- .../vsearch/cluster/vsearch-cluster.diff | 50 +++++++++++++ nextflow.config | 4 ++ nextflow_schema.json | 72 ++++++++++++++----- workflows/crisprseq.nf | 5 +- 7 files changed, 132 insertions(+), 37 deletions(-) create mode 100644 modules/nf-core/vsearch/cluster/vsearch-cluster.diff diff --git a/conf/modules.config b/conf/modules.config index eb6eb831..f4a637a7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -105,6 +105,12 @@ process { ext.args = '--max-error 3 --adapter-length 250 --fwd-context ""' } + withName: VSEARCH_CLUSTER { + ext.args = "--minseqlength ${params.vsearch_minseqlength} --maxseqlength ${params.vsearch_maxseqlength} --qmask none --clusterout_sort --gapopen 0E/5I --gapext 0E/2I --mismatch -8 --match 6 --iddef 0 --minwordmatches 0 --qmask none -id ${params.vsearch_id}" + ext.args2 = '--cluster_fast' + ext.args3 = '--clusters' + } + withName: MERGING_SUMMARY { publishDir = [ path: { "${params.outdir}/summary/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, diff --git a/modules.json b/modules.json index 350f06ae..c741cfd1 100644 --- a/modules.json +++ b/modules.json @@ -75,7 +75,8 @@ "vsearch/cluster": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/vsearch/cluster/vsearch-cluster.diff" } } }, diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf index 7bcda8ac..c450470b 100644 --- a/modules/nf-core/vsearch/cluster/main.nf +++ b/modules/nf-core/vsearch/cluster/main.nf @@ -37,16 +37,16 @@ process VSEARCH_CLUSTER { if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) { error "Unknown clustering option provided (${args2})" } - def out_ext = args3.contains("--alnout") ? "aln" : - args3.contains("--biomout") ? "biom" : - args3.contains("--blast6out") ? "blast.tsv" : - args3.contains("--centroids") ? "centroids.fasta" : - args3.contains("--clusters") ? "clusters.fasta" : - args3.contains("--mothur_shared_out") ? "mothur.tsv" : - args3.contains("--msaout") ? "msa.fasta" : - args3.contains("--otutabout") ? "otu.tsv" : - args3.contains("--profile") ? "profile.txt" : - args3.contains("--samout") ? "sam" : + def out_ext = args3.contains("--alnout") ? ".aln" : + args3.contains("--biomout") ? ".biom" : + args3.contains("--blast6out") ? ".blast.tsv" : + args3.contains("--centroids") ? ".centroids.fasta" : + args3.contains("--clusters") ? "_clusters" : + args3.contains("--mothur_shared_out") ? ".mothur.tsv" : + args3.contains("--msaout") ? ".msa.fasta" : + args3.contains("--otutabout") ? ".otu.tsv" : + args3.contains("--profile") ? ".profile.txt" : + args3.contains("--samout") ? ".sam" : args3.contains("--uc") ? "uc.tsv" : args3.contains("--userout") ? "out.tsv" : "" @@ -54,17 +54,10 @@ process VSEARCH_CLUSTER { """ vsearch \\ $args2 $fasta \\ - $args3 ${prefix}.${out_ext} \\ + $args3 ${prefix}${out_ext} \\ --threads $task.cpus \\ $args - if [[ $args3 != "--samout" ]] - then - gzip -n ${prefix}.${out_ext} - else - samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam - fi - cat <<-END_VERSIONS > versions.yml "${task.process}": vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//') diff --git a/modules/nf-core/vsearch/cluster/vsearch-cluster.diff b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff new file mode 100644 index 00000000..13ad9aa5 --- /dev/null +++ b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff @@ -0,0 +1,50 @@ +Changes in module 'nf-core/vsearch/cluster' +--- modules/nf-core/vsearch/cluster/main.nf ++++ modules/nf-core/vsearch/cluster/main.nf +@@ -37,16 +37,16 @@ + if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) { + error "Unknown clustering option provided (${args2})" + } +- def out_ext = args3.contains("--alnout") ? "aln" : +- args3.contains("--biomout") ? "biom" : +- args3.contains("--blast6out") ? "blast.tsv" : +- args3.contains("--centroids") ? "centroids.fasta" : +- args3.contains("--clusters") ? "clusters.fasta" : +- args3.contains("--mothur_shared_out") ? "mothur.tsv" : +- args3.contains("--msaout") ? "msa.fasta" : +- args3.contains("--otutabout") ? "otu.tsv" : +- args3.contains("--profile") ? "profile.txt" : +- args3.contains("--samout") ? "sam" : ++ def out_ext = args3.contains("--alnout") ? ".aln" : ++ args3.contains("--biomout") ? ".biom" : ++ args3.contains("--blast6out") ? ".blast.tsv" : ++ args3.contains("--centroids") ? ".centroids.fasta" : ++ args3.contains("--clusters") ? "_clusters" : ++ args3.contains("--mothur_shared_out") ? ".mothur.tsv" : ++ args3.contains("--msaout") ? ".msa.fasta" : ++ args3.contains("--otutabout") ? ".otu.tsv" : ++ args3.contains("--profile") ? ".profile.txt" : ++ args3.contains("--samout") ? ".sam" : + args3.contains("--uc") ? "uc.tsv" : + args3.contains("--userout") ? "out.tsv" : + "" +@@ -54,16 +54,9 @@ + """ + vsearch \\ + $args2 $fasta \\ +- $args3 ${prefix}.${out_ext} \\ ++ $args3 ${prefix}${out_ext} \\ + --threads $task.cpus \\ + $args +- +- if [[ $args3 != "--samout" ]] +- then +- gzip -n ${prefix}.${out_ext} +- else +- samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam +- fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/nextflow.config b/nextflow.config index e415ebe9..1aaf3c0e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,10 @@ params { input = null aligner = 'minimap2' + // Vsearch options + vsearch_minseqlength = 55 + vsearch_maxseqlength = 57 + vsearch_id = 0.99 // References genome = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 20ed8752..efad63df 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,22 +5,6 @@ "description": "Pipeline for the analysis of crispr data", "type": "object", "definitions": { - "alignment_parameters": { - "title": "Alignment parameters", - "type": "object", - "description": "", - "default": "", - "properties": { - "aligner": { - "type": "string", - "description": "Aligner program to use", - "default": "minimap2", - "fa_icon": "fas fa-align-justify", - "enum": ["minimap2", "bwa", "bowtie2"] - } - }, - "required": ["aligner"] - }, "input_output_options": { "title": "Input/output options", "type": "object", @@ -58,6 +42,57 @@ } } }, + "alignment_parameters": { + "title": "Alignment parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "aligner": { + "type": "string", + "description": "Aligner program to use.", + "default": "minimap2", + "fa_icon": "fas fa-align-justify", + "enum": ["minimap2", "bwa", "bowtie2"] + } + }, + "required": ["aligner"], + "fa_icon": "fas fa-align-justify" + }, + "vsearch_parameters": { + "title": "Vsearch parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "vsearch_minseqlength": { + "type": "integer", + "default": 55, + "fa_icon": "fas fa-minus", + "description": "Vsearch minimum sequence length.", + "help_text": "Discard sequences shorter than vsearch_minseqlength.", + "minimum": 1 + }, + "vsearch_maxseqlength": { + "type": "integer", + "default": 57, + "fa_icon": "fas fa-plus", + "description": "Vsearch maximum sequence length.", + "help_text": "Discard sequences longer than vsearch_minseqlength.", + "minimum": 1 + }, + "vsearch_id": { + "type": "number", + "default": 0.99, + "fa_icon": "fas fa-equals", + "description": "Vsearch pairwise identity threshold.", + "help_text": "Do not add the target to the cluster if the pairwise identity with the centroid is lower than id. The pairwise identity is defined as the number of (matching columns) / (alignment length - terminal gaps).", + "minimum": 0, + "maximum": 1 + } + }, + "fa_icon": "fas fa-layer-group" + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -284,11 +319,14 @@ } }, "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, { "$ref": "#/definitions/alignment_parameters" }, { - "$ref": "#/definitions/input_output_options" + "$ref": "#/definitions/vsearch_parameters" }, { "$ref": "#/definitions/reference_genome_options" diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index d7d0c55e..f60b937d 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -277,7 +277,10 @@ workflow CRISPRSEQ { SEQTK_SEQ.out.fastx ) - EXTRACT_UMIS.out.fasta.view() + + VSEARCH_CLUSTER ( + EXTRACT_UMIS.out.fasta + ) /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: From 8467cd9c1303190bec94579372b491a1796e9cf9 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 20 Mar 2023 10:09:46 +0100 Subject: [PATCH 06/45] don't compress cluster fastas --- modules/nf-core/vsearch/cluster/main.nf | 2 +- modules/nf-core/vsearch/cluster/vsearch-cluster.diff | 9 +++++++++ workflows/crisprseq.nf | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf index c450470b..413f70c9 100644 --- a/modules/nf-core/vsearch/cluster/main.nf +++ b/modules/nf-core/vsearch/cluster/main.nf @@ -20,7 +20,7 @@ process VSEARCH_CLUSTER { tuple val(meta), path('*.blast.tsv.gz') , optional: true, emit: blast tuple val(meta), path('*.uc.tsv.gz') , optional: true, emit: uc tuple val(meta), path('*.centroids.fasta.gz') , optional: true, emit: centroids - tuple val(meta), path('*.clusters.fasta.gz') , optional: true, emit: clusters + tuple val(meta), path('*_clusters*') , optional: true, emit: clusters tuple val(meta), path('*.profile.txt.gz') , optional: true, emit: profile tuple val(meta), path('*.msa.fasta.gz') , optional: true, emit: msa path "versions.yml" , emit: versions diff --git a/modules/nf-core/vsearch/cluster/vsearch-cluster.diff b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff index 13ad9aa5..ff6808ac 100644 --- a/modules/nf-core/vsearch/cluster/vsearch-cluster.diff +++ b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff @@ -1,6 +1,15 @@ Changes in module 'nf-core/vsearch/cluster' --- modules/nf-core/vsearch/cluster/main.nf +++ modules/nf-core/vsearch/cluster/main.nf +@@ -20,7 +20,7 @@ + tuple val(meta), path('*.blast.tsv.gz') , optional: true, emit: blast + tuple val(meta), path('*.uc.tsv.gz') , optional: true, emit: uc + tuple val(meta), path('*.centroids.fasta.gz') , optional: true, emit: centroids +- tuple val(meta), path('*.clusters.fasta.gz') , optional: true, emit: clusters ++ tuple val(meta), path('*_clusters*') , optional: true, emit: clusters + tuple val(meta), path('*.profile.txt.gz') , optional: true, emit: profile + tuple val(meta), path('*.msa.fasta.gz') , optional: true, emit: msa + path "versions.yml" , emit: versions @@ -37,16 +37,16 @@ if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) { error "Unknown clustering option provided (${args2})" diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index f60b937d..f5d7c796 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -282,6 +282,8 @@ workflow CRISPRSEQ { EXTRACT_UMIS.out.fasta ) + VSEARCH_CLUSTER.out.clusters.transpose().view() + /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: From bc43a971837f0e2e9db45b7cb8794315b4948fb1 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 21 Mar 2023 10:48:32 +0100 Subject: [PATCH 07/45] add closure to obtain sequence from umi --- workflows/crisprseq.nf | 69 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index f5d7c796..48370acf 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -48,6 +48,7 @@ include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file' include { ORIENT_REFERENCE } from '../modules/local/orient_reference' include { CIGAR_PARSER } from '../modules/local/cigar_parser' include { MERGING_SUMMARY } from '../modules/local/merging_summary' +include { UMI_TO_SEQUENCE } from '../modules/local/umi_to_sequence' include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' @@ -79,6 +80,32 @@ include { CUTADAPT } from '../modules/nf-co include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + DEFINE GROOVY FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +sufix = "" + +umi_to_sequence = { meta, cluster -> + cluster.withReader { source -> + output = file("${cluster.baseName}_${sufix}.fasta") + output.withWriter { target -> + String line + while ( line=source.readLine() ) { + if (line.startsWith(">")) { + sequence = (line =~ /;seq=(.*$)/)[0][1] + id = (line =~ /(>.*?);/)[0][1] + } + target << id + "\n" + sequence + } + } + } + return [meta, output] +} + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -278,11 +305,49 @@ workflow CRISPRSEQ { ) + // + // MODULE: Cluster UMIs + // VSEARCH_CLUSTER ( EXTRACT_UMIS.out.fasta ) - VSEARCH_CLUSTER.out.clusters.transpose().view() + // Obtain a file with UBS (UBI bin size) and UMI ID + VSEARCH_CLUSTER.out.clusters + .transpose() + .collectFile( storeDir:params.outdir ) { + it -> + [ "${it[0].id}_ubs.txt", "${it[1].countFasta()}\t${it[1].baseName}\n" ] + } + + // Branch the clusters into the ones containing only one sequence and the ones containing more than one sequences + VSEARCH_CLUSTER.out.clusters + .transpose() + .branch{ + meta, cluster -> + single: cluster.countFasta() == 1 + return [meta, cluster] + cluster: cluster.countFasta() > 1 + return [meta, cluster] + } + .set{ ch_umi_bysize } + + // Get the correspondent fasta sequencences from single clusters + sufix = "consensus" + + ch_umi_bysize.single + .map(umi_to_sequence) + .set{ ch_cluster_sequence } + + + ch_cluster_sequence.view() + + // + // MODULE: Obtain longest read in cluster + // + VSEARCH_SORT( + + ) /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: @@ -290,8 +355,6 @@ workflow CRISPRSEQ { Modules to implement: - vsearch - get_ubs top_read polishing: minimap2, racon consensus From 181c3d5b2ec439a191edc981140a4c1f9c037abc Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 24 Mar 2023 11:16:52 +0100 Subject: [PATCH 08/45] try to make umi_to_sequence.py faster --- modules/local/umi_to_sequence.nf | 21 ++++++++++++++++ templates/umi_to_sequence.py | 11 +++++++++ workflows/crisprseq.nf | 41 +++++--------------------------- 3 files changed, 38 insertions(+), 35 deletions(-) create mode 100644 modules/local/umi_to_sequence.nf create mode 100644 templates/umi_to_sequence.py diff --git a/modules/local/umi_to_sequence.nf b/modules/local/umi_to_sequence.nf new file mode 100644 index 00000000..aa3be435 --- /dev/null +++ b/modules/local/umi_to_sequence.nf @@ -0,0 +1,21 @@ +process UMI_TO_SEQUENCE { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::p7zip==16.02" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : + 'quay.io/biocontainers/p7zip:16.02' }" + + input: + tuple val(meta), path(cluster) + + output: + tuple val(meta), path("*.fasta"), emit: fasta + + when: + task.ext.when == null || task.ext.when + + script: + template 'umi_to_sequence.py' +} diff --git a/templates/umi_to_sequence.py b/templates/umi_to_sequence.py new file mode 100644 index 00000000..09a24bd0 --- /dev/null +++ b/templates/umi_to_sequence.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +#### Extract the read sequence from a FASTA file of UMIs (output of extract_umis.py) +#### author: Júlia Mir Pedrol @mirpedrol +#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. + +with open("$cluster") as i: + for line in i: + split = line.split(";") + with open("${cluster.baseName}_${task.ext.prefix}.fasta", "w") as o: + o.write(f"{line.split(';')[0]}\n{line.split('=')[-1]}") diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 48370acf..5d985cf1 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -80,32 +80,6 @@ include { CUTADAPT } from '../modules/nf-co include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - DEFINE GROOVY FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -sufix = "" - -umi_to_sequence = { meta, cluster -> - cluster.withReader { source -> - output = file("${cluster.baseName}_${sufix}.fasta") - output.withWriter { target -> - String line - while ( line=source.readLine() ) { - if (line.startsWith(">")) { - sequence = (line =~ /;seq=(.*$)/)[0][1] - id = (line =~ /(>.*?);/)[0][1] - } - target << id + "\n" + sequence - } - } - } - return [meta, output] -} - - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -333,21 +307,18 @@ workflow CRISPRSEQ { .set{ ch_umi_bysize } // Get the correspondent fasta sequencences from single clusters - sufix = "consensus" - - ch_umi_bysize.single - .map(umi_to_sequence) - .set{ ch_cluster_sequence } - + // + UMI_TO_SEQUENCE ( + ch_umi_bysize.single + ) - ch_cluster_sequence.view() // // MODULE: Obtain longest read in cluster // - VSEARCH_SORT( + //VSEARCH_SORT( - ) + //) /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: From 2198371bf2bfcc06f493e8c46d675cbc6dea481e Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 24 Mar 2023 15:38:33 +0100 Subject: [PATCH 09/45] use a groovy function for umi_to_sequence --- modules/local/umi_to_sequence.nf | 21 -------------------- templates/umi_to_sequence.py | 11 ----------- workflows/crisprseq.nf | 34 +++++++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 37 deletions(-) delete mode 100644 modules/local/umi_to_sequence.nf delete mode 100644 templates/umi_to_sequence.py diff --git a/modules/local/umi_to_sequence.nf b/modules/local/umi_to_sequence.nf deleted file mode 100644 index aa3be435..00000000 --- a/modules/local/umi_to_sequence.nf +++ /dev/null @@ -1,21 +0,0 @@ -process UMI_TO_SEQUENCE { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::p7zip==16.02" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : - 'quay.io/biocontainers/p7zip:16.02' }" - - input: - tuple val(meta), path(cluster) - - output: - tuple val(meta), path("*.fasta"), emit: fasta - - when: - task.ext.when == null || task.ext.when - - script: - template 'umi_to_sequence.py' -} diff --git a/templates/umi_to_sequence.py b/templates/umi_to_sequence.py deleted file mode 100644 index 09a24bd0..00000000 --- a/templates/umi_to_sequence.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python3 - -#### Extract the read sequence from a FASTA file of UMIs (output of extract_umis.py) -#### author: Júlia Mir Pedrol @mirpedrol -#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. - -with open("$cluster") as i: - for line in i: - split = line.split(";") - with open("${cluster.baseName}_${task.ext.prefix}.fasta", "w") as o: - o.write(f"{line.split(';')[0]}\n{line.split('=')[-1]}") diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 5d985cf1..fa768c23 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -48,7 +48,6 @@ include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file' include { ORIENT_REFERENCE } from '../modules/local/orient_reference' include { CIGAR_PARSER } from '../modules/local/cigar_parser' include { MERGING_SUMMARY } from '../modules/local/merging_summary' -include { UMI_TO_SEQUENCE } from '../modules/local/umi_to_sequence' include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' @@ -80,6 +79,26 @@ include { CUTADAPT } from '../modules/nf-co include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + DEFINE GROOVY FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +def umi_to_sequence(cluster) { + cluster.withReader { source -> + String line + while ( line=source.readLine() ) { + if (line.startsWith(">")) { + sequence = (line =~ /;seq=(.*$)/)[0][1] + id = (line =~ /(>.*?);/)[0][1] + } + } + } + return id + "\n" + sequence +} + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -307,10 +326,15 @@ workflow CRISPRSEQ { .set{ ch_umi_bysize } // Get the correspondent fasta sequencences from single clusters - // - UMI_TO_SEQUENCE ( - ch_umi_bysize.single - ) + ch_umi_bysize.single + .map{ meta, cluster -> + fasta_line = umi_to_sequence(cluster) + [meta, cluster.baseName, fasta_line] + } + .collectFile() { meta, name, fasta -> + [ "{$name}_consensus.fasta", fasta ] + } + .set{ ch_single_clusters_consensus } // From 828dbf5729dcab6bca09b554216af80cafab14ed Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Wed, 5 Apr 2023 15:03:27 +0200 Subject: [PATCH 10/45] install vsearch/sort --- modules.json | 5 +++ modules/nf-core/vsearch/sort/main.nf | 47 +++++++++++++++++++++++++ modules/nf-core/vsearch/sort/meta.yml | 49 +++++++++++++++++++++++++++ workflows/crisprseq.nf | 1 + 4 files changed, 102 insertions(+) create mode 100644 modules/nf-core/vsearch/sort/main.nf create mode 100644 modules/nf-core/vsearch/sort/meta.yml diff --git a/modules.json b/modules.json index c741cfd1..afa23421 100644 --- a/modules.json +++ b/modules.json @@ -77,6 +77,11 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"], "patch": "modules/nf-core/vsearch/cluster/vsearch-cluster.diff" + }, + "vsearch/sort": { + "branch": "master", + "git_sha": "2729a013f2c8930ea39d747e2fa589ca89d201ee", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/vsearch/sort/main.nf b/modules/nf-core/vsearch/sort/main.nf new file mode 100644 index 00000000..5f410025 --- /dev/null +++ b/modules/nf-core/vsearch/sort/main.nf @@ -0,0 +1,47 @@ +process VSEARCH_SORT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::vsearch=2.21.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/vsearch:2.21.1--h95f258a_0': + 'quay.io/biocontainers/vsearch:2.21.1--h95f258a_0' }" + + input: + tuple val(meta), path(fasta) + val sort_arg + + output: + tuple val(meta), path("*_sorted.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$fasta" == "${prefix}.fasta") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + vsearch \\ + $sort_arg $fasta \\ + --threads $task.cpus \\ + --output ${prefix}.fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g;s/,.*//g;s/^v//;s/_.*//') + END_VERSIONS + """ + + stub: + """ + touch ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g;s/,.*//g;s/^v//;s/_.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/vsearch/sort/meta.yml b/modules/nf-core/vsearch/sort/meta.yml new file mode 100644 index 00000000..a527b7f1 --- /dev/null +++ b/modules/nf-core/vsearch/sort/meta.yml @@ -0,0 +1,49 @@ +name: "vsearch_sort" +description: Sort fasta entries by decreasing abundance (--sortbysize) or sequence length (--sortbylength). +keywords: + - vsearch/sort + - vsearch + - sort + - amplicon sequences + - metagenomics + - genomics + - population genetics +tools: + - vsearch: + description: VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative) + homepage: https://github.com/torognes/vsearch + documentation: https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf + tool_dev_url: https://github.com/torognes/vsearch + doi: 10.7717/peerj.2584 + licence: ["GPL v3-or-later OR BSD-2-clause"] + +input: + - meta: + type: map + description: Groovy Map containing sample information e.g. [ id:'test' ] + - fasta: + type: file + description: Sequences to be sorted in FASTA format + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + - sort_arg: + type: string + description: Argument to provide to sort algorithm. Sort by abundance with --sortbysize or by sequence length with --sortbylength. + enums: ["--sortbysize", "--sortbylength"] + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Sorted FASTA file + pattern: "*.{fasta}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mirpedrol" diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index fa768c23..2cd4d084 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -69,6 +69,7 @@ include { PEAR } from '../modules/nf-co include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' include { SEQTK_SEQ } from '../modules/nf-core/seqtk/seq/main' include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' +include { VSEARCH_SORT } from '../modules/nf-core/vsearch/sort/main' include { BOWTIE2_ALIGN } from '../modules/nf-core/bowtie2/align/main' include { BOWTIE2_BUILD } from '../modules/nf-core/bowtie2/build/main' include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' From 731ffb3437e56fe0155408f7234576938504bccc Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Wed, 5 Apr 2023 15:13:21 +0200 Subject: [PATCH 11/45] add umi_bin_size parameter --- nextflow.config | 3 +++ nextflow_schema.json | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 1aaf3c0e..be5bd498 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,9 @@ params { input = null aligner = 'minimap2' + // UMI parameters + umi_bin_size = 1 + // Vsearch options vsearch_minseqlength = 55 vsearch_maxseqlength = 57 diff --git a/nextflow_schema.json b/nextflow_schema.json index efad63df..74b49103 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,10 +42,25 @@ } } }, + "umi_parameters": { + "title": "UMI parameters", + "type": "object", + "description": "Parameters regarding umi molecular identifiers (UMIs)", + "default": "", + "properties": { + "umi_bin_size": { + "type": "integer", + "default": 1, + "description": "Minimum size of a UMI cluster.", + "fa_icon": "fas fa-sort-amount-down" + } + }, + "fa_icon": "fas fa-layer-group" + }, "alignment_parameters": { "title": "Alignment parameters", "type": "object", - "description": "", + "description": "Parameters used for alignment processes", "default": "", "properties": { "aligner": { @@ -62,7 +77,7 @@ "vsearch_parameters": { "title": "Vsearch parameters", "type": "object", - "description": "", + "description": "Parameters to use in Vsearch processes", "default": "", "properties": { "vsearch_minseqlength": { @@ -322,6 +337,9 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/umi_parameters" + }, { "$ref": "#/definitions/alignment_parameters" }, From 46535d8574a9deb8f3854bb8cd19612cee132e4a Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Thu, 6 Apr 2023 09:57:15 +0200 Subject: [PATCH 12/45] update vsearch/sort and add sequence obtention from umi --- conf/modules.config | 6 +++ modules.json | 2 +- modules/nf-core/vsearch/sort/main.nf | 4 +- workflows/crisprseq.nf | 58 +++++++++++++++++++++++++--- 4 files changed, 61 insertions(+), 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f4a637a7..e07f587b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -111,6 +111,12 @@ process { ext.args3 = '--clusters' } + + withName: VSEARCH_SORT { + ext.args = '--topn 1' + ext.prefix = { "${fasta.baseName}_top" } + } + withName: MERGING_SUMMARY { publishDir = [ path: { "${params.outdir}/summary/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, diff --git a/modules.json b/modules.json index afa23421..5fe66a88 100644 --- a/modules.json +++ b/modules.json @@ -80,7 +80,7 @@ }, "vsearch/sort": { "branch": "master", - "git_sha": "2729a013f2c8930ea39d747e2fa589ca89d201ee", + "git_sha": "e7801603532df26b4bb4ef324ca2c39f7a4d0eee", "installed_by": ["modules"] } } diff --git a/modules/nf-core/vsearch/sort/main.nf b/modules/nf-core/vsearch/sort/main.nf index 5f410025..a6f06535 100644 --- a/modules/nf-core/vsearch/sort/main.nf +++ b/modules/nf-core/vsearch/sort/main.nf @@ -12,8 +12,8 @@ process VSEARCH_SORT { val sort_arg output: - tuple val(meta), path("*_sorted.fasta"), emit: fasta - path "versions.yml" , emit: versions + tuple val(meta), path("*.fasta"), emit: fasta + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 2cd4d084..3271407a 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -75,6 +75,7 @@ include { BOWTIE2_BUILD } from '../modules/nf-co include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI } from '../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' @@ -99,6 +100,20 @@ def umi_to_sequence(cluster) { return id + "\n" + sequence } +def umi_to_sequence_centroid(cluster) { + cluster.withReader { source -> + String line + while ( line=source.readLine() ) { + if (line.startsWith(">")) { + sequence = (line =~ /;seq=(.*$)/)[0][1] + id = (line =~ /(>.*?);/)[0][1] + } + } + } + id = id.replace(">", ">centroid_") + return id + "\n" + sequence +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -319,9 +334,9 @@ workflow CRISPRSEQ { .transpose() .branch{ meta, cluster -> - single: cluster.countFasta() == 1 + single: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() == 1 return [meta, cluster] - cluster: cluster.countFasta() > 1 + cluster: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() > 1 return [meta, cluster] } .set{ ch_umi_bysize } @@ -339,11 +354,43 @@ workflow CRISPRSEQ { // - // MODULE: Obtain longest read in cluster + // MODULE: Obtain most abundant UMI in cluster + // + VSEARCH_SORT( + ch_umi_bysize.cluster, + Channel.value("--sortbysize") + ) + + // Get the correspondent fasta sequencences from top cluster sequences + // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps + VSEARCH_SORT.out.fasta + .map{ meta, fasta -> + fasta_line = umi_to_sequence_centroid(fasta) + [meta, fasta.baseName, fasta_line] + } + .collectFile() { meta, name, fasta -> + [ "{$name}.fasta", fasta ] + } + .set{ ch_top_clusters_sequence } + + // Get the correspondent fasta sequencences from UMI clusters + ch_umi_bysize.cluster + .map{ meta, cluster -> + fasta_line = umi_to_sequence(cluster) + [meta, cluster.baseName, fasta_line] + } + .collectFile() { meta, name, fasta -> + [ "{$name}_sequences.fasta", fasta ] + } + .set{ ch_clusters_sequence } + + // + // MODULE: Mapping with minimap2 // - //VSEARCH_SORT( + // Map each cluster against the top read (most abundant UMI) in the cluster + MINIMAP2_ALIGN_UMI ( - //) + ) /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: @@ -351,7 +398,6 @@ workflow CRISPRSEQ { Modules to implement: - top_read polishing: minimap2, racon consensus join_reads From 75584e27a306eb4735f6e73a1121b47cfec0c050 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Thu, 6 Apr 2023 10:11:14 +0200 Subject: [PATCH 13/45] add first minimap step for UMI consensus --- conf/modules.config | 5 +++++ workflows/crisprseq.nf | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index e07f587b..a4922cb5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -125,6 +125,11 @@ process { ] } + withName: MINIMAP2_ALIGN_UMI { + ext.args = '-x map-ont' + ext.prefix = { "${reads.baseName}_cycle1" } + } + withName: DUMMY_FINAL_UMI { publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 3271407a..cf4072fa 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -384,14 +384,24 @@ workflow CRISPRSEQ { } .set{ ch_clusters_sequence } + ch_clusters_sequence + .join(ch_top_clusters_sequence) + .set{ ch_cluster_reference } + // // MODULE: Mapping with minimap2 // // Map each cluster against the top read (most abundant UMI) in the cluster MINIMAP2_ALIGN_UMI ( - + ch_clusters_sequence + .join(ch_top_clusters_sequence), + false, //output in paf format + false, + false ) + + /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: From 419962829886c593251a9c5389d3d69502e724ce Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Thu, 6 Apr 2023 10:12:25 +0200 Subject: [PATCH 14/45] install new module racon --- modules.json | 5 ++++ modules/nf-core/racon/main.nf | 38 +++++++++++++++++++++++++ modules/nf-core/racon/meta.yml | 52 ++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 modules/nf-core/racon/main.nf create mode 100644 modules/nf-core/racon/meta.yml diff --git a/modules.json b/modules.json index 5fe66a88..78a82edf 100644 --- a/modules.json +++ b/modules.json @@ -62,6 +62,11 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "racon": { + "branch": "master", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "installed_by": ["modules"] + }, "samtools/index": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf new file mode 100644 index 00000000..f8ed3691 --- /dev/null +++ b/modules/nf-core/racon/main.nf @@ -0,0 +1,38 @@ +process RACON { + tag "$meta.id" + label 'process_high' + + conda "bioconda::racon=1.4.20" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/racon:1.4.20--h9a82719_1' : + 'quay.io/biocontainers/racon:1.4.20--h9a82719_1' }" + + input: + tuple val(meta), path(reads), path(assembly), path(paf) + + output: + tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + racon -t "$task.cpus" \\ + "${reads}" \\ + "${paf}" \\ + $args \\ + "${assembly}" > \\ + ${prefix}_assembly_consensus.fasta + + gzip -n ${prefix}_assembly_consensus.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + racon: \$( racon --version 2>&1 | sed 's/^.*v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/racon/meta.yml b/modules/nf-core/racon/meta.yml new file mode 100644 index 00000000..2e7737d9 --- /dev/null +++ b/modules/nf-core/racon/meta.yml @@ -0,0 +1,52 @@ +name: racon +description: Consensus module for raw de novo DNA assembly of long uncorrected reads +keywords: + - assembly + - pacbio + - nanopore + - polish +tools: + - racon: + description: Ultrafast consensus module for raw de novo genome assembly of long uncorrected reads. + homepage: https://github.com/lbcb-sci/racon + documentation: https://github.com/lbcb-sci/racon + tool_dev_url: https://github.com/lbcb-sci/racon + doi: 10.1101/gr.214270.116 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files. Racon expects single end reads + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" + - assembly: + type: file + description: Genome assembly to be improved + pattern: "*.{fasta,fa}" + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - improved_assembly: + type: file + description: Improved genome assembly + pattern: "*_assembly_consensus.fasta.gz" + +authors: + - "@avantonder" From e9adce876216ba6dde4f99ca12983b8815a5ee54 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Thu, 6 Apr 2023 13:57:58 +0200 Subject: [PATCH 15/45] add racon --- conf/modules.config | 17 +++++++- workflows/crisprseq.nf | 88 +++++++++++++++++++++++++++++++++++------- 2 files changed, 91 insertions(+), 14 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a4922cb5..94c79519 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -125,11 +125,26 @@ process { ] } - withName: MINIMAP2_ALIGN_UMI { + withName: MINIMAP2_ALIGN_UMI_1 { ext.args = '-x map-ont' ext.prefix = { "${reads.baseName}_cycle1" } } + withName: MINIMAP2_ALIGN_UMI_2 { + ext.args = '-x map-ont' + ext.prefix = { "${reads.baseName}_cycle2" } + } + + withName: RACON_1 { + ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming' + ext.prefix = { "${reads.baseName}_cycle1" } + } + + withName: RACON_2 { + ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming' + ext.prefix = { "${reads.baseName}_cycle2" } + } + withName: DUMMY_FINAL_UMI { publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index cf4072fa..f2f352a7 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -70,12 +70,15 @@ include { CAT_FASTQ } from '../modules/nf-co include { SEQTK_SEQ } from '../modules/nf-core/seqtk/seq/main' include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' include { VSEARCH_SORT } from '../modules/nf-core/vsearch/sort/main' +include { RACON as RACON_1 } from '../modules/nf-core/racon/main' +include { RACON as RACON_2 } from '../modules/nf-core/racon/main' include { BOWTIE2_ALIGN } from '../modules/nf-core/bowtie2/align/main' include { BOWTIE2_BUILD } from '../modules/nf-core/bowtie2/build/main' include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main' -include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1 } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' @@ -343,16 +346,26 @@ workflow CRISPRSEQ { // Get the correspondent fasta sequencences from single clusters ch_umi_bysize.single + .tap{ meta_channel } .map{ meta, cluster -> fasta_line = umi_to_sequence(cluster) [meta, cluster.baseName, fasta_line] } .collectFile() { meta, name, fasta -> - [ "{$name}_consensus.fasta", fasta ] + [ "${name}_consensus.fasta", fasta ] + } + .map{ new_file -> + [new_file.baseName, new_file] + } + .join(meta_channel + .map { meta, original_file -> + ["${original_file.baseName}_consensus", meta] + }) + .map{ file_name, new_file, meta -> + [meta, new_file] } .set{ ch_single_clusters_consensus } - // // MODULE: Obtain most abundant UMI in cluster // @@ -364,35 +377,56 @@ workflow CRISPRSEQ { // Get the correspondent fasta sequencences from top cluster sequences // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps VSEARCH_SORT.out.fasta + .tap{ meta_channel_2 } .map{ meta, fasta -> fasta_line = umi_to_sequence_centroid(fasta) [meta, fasta.baseName, fasta_line] } .collectFile() { meta, name, fasta -> - [ "{$name}.fasta", fasta ] + [ "${name}.fasta", fasta ] + } + .map{ new_file -> + [new_file.baseName[0..-5], new_file] // Substring is removing "_top" added by VSEARCH_SORT + } + .join(meta_channel_2 + .map { meta, original_file -> + ["${original_file.baseName}"[0..-5], meta] // Substring is removing "_top" added by VSEARCH_SORT + }) + .map{ file_name, new_file, meta -> + [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map } .set{ ch_top_clusters_sequence } // Get the correspondent fasta sequencences from UMI clusters ch_umi_bysize.cluster + .tap{ meta_channel_3 } .map{ meta, cluster -> fasta_line = umi_to_sequence(cluster) [meta, cluster.baseName, fasta_line] } .collectFile() { meta, name, fasta -> - [ "{$name}_sequences.fasta", fasta ] + [ "${name}_sequences.fasta", fasta ] + } + .map{ new_file -> + [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile + } + .join(meta_channel_3 + .map { meta, original_file -> + ["${original_file.baseName}", meta] + }) + .map{ file_name, new_file, meta -> + [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map } .set{ ch_clusters_sequence } - ch_clusters_sequence - .join(ch_top_clusters_sequence) - .set{ ch_cluster_reference } + // Cluster consensus & polishing + // Two cycles of minimap2 + racon // - // MODULE: Mapping with minimap2 + // MODULE: Mapping with minimap2 - cycle 1 // // Map each cluster against the top read (most abundant UMI) in the cluster - MINIMAP2_ALIGN_UMI ( + MINIMAP2_ALIGN_UMI_1 ( ch_clusters_sequence .join(ch_top_clusters_sequence), false, //output in paf format @@ -400,7 +434,35 @@ workflow CRISPRSEQ { false ) + // + // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2 + // + RACON_1 ( + ch_clusters_sequence + .join(ch_top_clusters_sequence) + .join(MINIMAP2_ALIGN_UMI_1.out.paf) + ) + // + // MODULE: Mapping with minimap2 - cycle 2 + // + // Map each cluster against the top read (most abundant UMI) in the cluster + MINIMAP2_ALIGN_UMI_2 ( + ch_clusters_sequence + .join(RACON_1.out.improved_assembly), + false, //output in paf format + false, + false + ) + + // + // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2 + // + RACON_2 ( + ch_clusters_sequence + .join(RACON_1.out.improved_assembly) + .join(MINIMAP2_ALIGN_UMI_2.out.paf) + ) /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: @@ -561,9 +623,9 @@ workflow CRISPRSEQ { // // MODULE: Parse cigar to find edits // - CIGAR_PARSER ( - ch_to_parse_cigar - ) + //CIGAR_PARSER ( + // ch_to_parse_cigar + //) // From a85fc451e22a22a234ef32197a6585fa74d4b6c0 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 5 May 2023 09:03:50 +0200 Subject: [PATCH 16/45] fix bug returning only one sequence from UMI cluster and filter UMI alignments to ignore empty alignments --- workflows/crisprseq.nf | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index f2f352a7..5c7d1355 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -93,14 +93,16 @@ include { SAMTOOLS_INDEX } from '../modules/nf-co def umi_to_sequence(cluster) { cluster.withReader { source -> String line + sequences = "" while ( line=source.readLine() ) { if (line.startsWith(">")) { sequence = (line =~ /;seq=(.*$)/)[0][1] id = (line =~ /(>.*?);/)[0][1] + sequences = sequences + id + "\n" + sequence + "\n" } } } - return id + "\n" + sequence + return sequences } def umi_to_sequence_centroid(cluster) { @@ -434,13 +436,18 @@ workflow CRISPRSEQ { false ) + // Only continue with clusters that have aligned sequences + MINIMAP2_ALIGN_UMI_1.out.paf + .filter{ it[1].countLines() > 0 } + .set{ ch_minimap_1 } + // - // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2 + // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1 // RACON_1 ( ch_clusters_sequence .join(ch_top_clusters_sequence) - .join(MINIMAP2_ALIGN_UMI_1.out.paf) + .join(ch_minimap_1) ) // @@ -455,13 +462,18 @@ workflow CRISPRSEQ { false ) + // Only continue with clusters that have aligned sequences + MINIMAP2_ALIGN_UMI_2.out.paf + .filter{ it[1].countLines() > 0 } + .set{ ch_minimap_2 } + // // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2 // RACON_2 ( ch_clusters_sequence .join(RACON_1.out.improved_assembly) - .join(MINIMAP2_ALIGN_UMI_2.out.paf) + .join(ch_minimap_2) ) /* From 424ee03534b1a9070fded4e2154b3f0142e73dae Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 5 May 2023 09:04:57 +0200 Subject: [PATCH 17/45] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 196d4589..9b303fc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Add UMI clustering to crisprseq-edition + ### Fixed ### Dependencies From 07e21ffa215afa5dbd20a84dc429007403b85e1c Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 5 May 2023 10:15:53 +0200 Subject: [PATCH 18/45] add test_umi tests to CI --- .github/workflows/ci.yml | 23 +++++++++++++++++++++++ CHANGELOG.md | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6c9113a..e858ddf9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,3 +41,26 @@ jobs: # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + + umis: + name: Run pipeline with UMI clustering test data + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/crisprseq') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "22.10.1" + - "latest-everything" + steps: + - name: Check out pipeline code + uses: actions/checkout@v3 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Run pipeline with UMI clustering test data + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_umis,docker --outdir ./results diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b303fc1..ecad079f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add UMI clustering to crisprseq-edition +- Add UMI clustering to crisprseq-edition ([#24](https://github.com/nf-core/crisprseq/pull/24)) ### Fixed From 4ff306e7f08ca03721bcbd904d4ffa5b58c31130 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 5 May 2023 11:10:33 +0200 Subject: [PATCH 19/45] install medaka --- modules.json | 5 ++++ modules/nf-core/medaka/main.nf | 40 ++++++++++++++++++++++++++++ modules/nf-core/medaka/meta.yml | 47 +++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+) create mode 100644 modules/nf-core/medaka/main.nf create mode 100644 modules/nf-core/medaka/meta.yml diff --git a/modules.json b/modules.json index 78a82edf..7e1984c3 100644 --- a/modules.json +++ b/modules.json @@ -46,6 +46,11 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "medaka": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "minimap2/align": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf new file mode 100644 index 00000000..d60915ee --- /dev/null +++ b/modules/nf-core/medaka/main.nf @@ -0,0 +1,40 @@ +process MEDAKA { + tag "$meta.id" + label 'process_high' + + conda "bioconda::medaka=1.4.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/medaka:1.4.4--py38h130def0_0' : + 'biocontainers/medaka:1.4.4--py38h130def0_0' }" + + input: + tuple val(meta), path(reads), path(assembly) + + output: + tuple val(meta), path("*.fa.gz"), emit: assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + medaka_consensus \\ + -t $task.cpus \\ + $args \\ + -i $reads \\ + -d $assembly \\ + -o ./ + + mv consensus.fasta ${prefix}.fa + + gzip -n ${prefix}.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/medaka/meta.yml b/modules/nf-core/medaka/meta.yml new file mode 100644 index 00000000..ed124d61 --- /dev/null +++ b/modules/nf-core/medaka/meta.yml @@ -0,0 +1,47 @@ +name: medaka +description: A tool to create consensus sequences and variant calls from nanopore sequencing data +keywords: + - assembly + - polishing + - nanopore +tools: + - medaka: + description: Neural network sequence error correction. + homepage: https://nanoporetech.github.io/medaka/index.html + documentation: https://nanoporetech.github.io/medaka/index.html + tool_dev_url: https://github.com/nanoporetech/medaka + + licence: ["Mozilla Public License 2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input nanopore fasta/FastQ files + pattern: "*.{fasta,fa,fastq,fastq.gz,fq,fq.gz}" + - assembly: + type: file + description: Genome assembly + pattern: "*.{fasta,fa}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - assembly: + type: file + description: Polished genome assembly + pattern: "*.fa.gz" + +authors: + - "@avantonder" From ea19682feca0e71714f4c744b32a842f6920f155 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Fri, 5 May 2023 11:19:57 +0200 Subject: [PATCH 20/45] install samtools/faidx --- modules.json | 5 +++ modules/nf-core/samtools/faidx/main.nf | 44 +++++++++++++++++++++++ modules/nf-core/samtools/faidx/meta.yml | 47 +++++++++++++++++++++++++ workflows/crisprseq.nf | 4 ++- 4 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/samtools/faidx/main.nf create mode 100644 modules/nf-core/samtools/faidx/meta.yml diff --git a/modules.json b/modules.json index 7e1984c3..34cbfba7 100644 --- a/modules.json +++ b/modules.json @@ -72,6 +72,11 @@ "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, + "samtools/faidx": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "samtools/index": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 00000000..4dd0e5b0 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path ("*.fai"), emit: fai + tuple val(meta), path ("*.gzi"), emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 00000000..fe2fe9a1 --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,47 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 5c7d1355..267ee346 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -476,13 +476,15 @@ workflow CRISPRSEQ { .join(ch_minimap_2) ) + + + /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: Modules to implement: - polishing: minimap2, racon consensus join_reads fa2fq From 9bf010576b4d80c030ce39ee04d2cf9f5e474918 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 8 May 2023 16:19:44 +0200 Subject: [PATCH 21/45] run faidx --- workflows/crisprseq.nf | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 267ee346..98404560 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -80,6 +80,7 @@ include { MINIMAP2_ALIGN } from '../modules/nf-co include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1 } from '../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' @@ -477,7 +478,12 @@ workflow CRISPRSEQ { ) - + // + // MODULE: Convert reference .fa to .fai + // + SAMTOOLS_FAIDX ( + RACON_2.out.improved_assembly + ) /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: From ebf6fd5a202fad494357ecc7ae436c3890ca0791 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 10:36:50 +0200 Subject: [PATCH 22/45] add minimap and samtools for mini_align process --- conf/modules.config | 8 +++ modules.json | 5 ++ modules/local/modified_minimap2.nf | 47 ++++++++++++++++++ modules/nf-core/minimap2/index/main.nf | 34 +++++++++++++ modules/nf-core/minimap2/index/meta.yml | 40 +++++++++++++++ workflows/crisprseq.nf | 66 +++++++++++++++++++------ 6 files changed, 185 insertions(+), 15 deletions(-) create mode 100644 modules/local/modified_minimap2.nf create mode 100644 modules/nf-core/minimap2/index/main.nf create mode 100644 modules/nf-core/minimap2/index/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 94c79519..9a9c9978 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -145,6 +145,14 @@ process { ext.prefix = { "${reads.baseName}_cycle2" } } + WithName: MINIMAP2_INDEX { + ext.args = '-I 16G -x map-ont' + } + + withName: MINIMAP2_ALIGN_CONSENSUS_1 { + ext.args = '-x map-ont --secondary=no --MD' + } + withName: DUMMY_FINAL_UMI { publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, diff --git a/modules.json b/modules.json index 34cbfba7..456ae16b 100644 --- a/modules.json +++ b/modules.json @@ -57,6 +57,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, + "minimap2/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba", diff --git a/modules/local/modified_minimap2.nf b/modules/local/modified_minimap2.nf new file mode 100644 index 00000000..12e12374 --- /dev/null +++ b/modules/local/modified_minimap2.nf @@ -0,0 +1,47 @@ +process MODIFIED_MINIMAP2 { + tag "$meta.id" + label 'process_medium' + + // Note: the versions here need to match the versions used in the mulled container below and minimap2/index + conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : + 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + + input: + tuple val(meta), path(reads), path(reference) + val bam_format + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.paf"), optional: true, emit: paf + tuple val(meta), path("*.bam"), optional: true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_output = bam_format ? "-a | samtools view -@ ${task.cpus} -T ${reference} -F 2308 -bS - | samtools sort -@ ${task.cpus} -o ${prefix}.bam" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ + minimap2 \\ + $args \\ + -t $task.cpus \\ + "${reference ?: reads}" \\ + "$reads" \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/index/main.nf b/modules/nf-core/minimap2/index/main.nf new file mode 100644 index 00000000..7a1bb227 --- /dev/null +++ b/modules/nf-core/minimap2/index/main.nf @@ -0,0 +1,34 @@ +process MINIMAP2_INDEX { + label 'process_medium' + + // Note: the versions here need to match the versions used in minimap2/align + conda "bioconda::minimap2=2.24" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimap2:2.24--h7132678_1' : + 'biocontainers/minimap2:2.24--h7132678_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.mmi"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + minimap2 \\ + -t $task.cpus \\ + -d ${fasta.baseName}.mmi \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/index/meta.yml b/modules/nf-core/minimap2/index/meta.yml new file mode 100644 index 00000000..b58f35c6 --- /dev/null +++ b/modules/nf-core/minimap2/index/meta.yml @@ -0,0 +1,40 @@ +name: minimap2_index +description: Provides fasta index required by minimap2 alignment. +keywords: + - index + - fasta + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: | + Reference database in FASTA format. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: Minimap2 fasta index. + pattern: "*.mmi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" + - "@drpatelh" diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 98404560..d9cdacf6 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -41,17 +41,18 @@ include { INPUT_CHECK } from '../subworkflows/local/input_c // // MODULE // -include { FIND_ADAPTERS } from '../modules/local/find_adapters' -include { EXTRACT_UMIS } from '../modules/local/extract_umis' -include { SEQ_TO_FILE as SEQ_TO_FILE_REF } from '../modules/local/seq_to_file' -include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file' -include { ORIENT_REFERENCE } from '../modules/local/orient_reference' -include { CIGAR_PARSER } from '../modules/local/cigar_parser' -include { MERGING_SUMMARY } from '../modules/local/merging_summary' -include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' -include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' -include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' -include { DUMMY_FINAL_UMI } from '../modules/local/dummy_final_umi' +include { FIND_ADAPTERS } from '../modules/local/find_adapters' +include { EXTRACT_UMIS } from '../modules/local/extract_umis' +include { SEQ_TO_FILE as SEQ_TO_FILE_REF } from '../modules/local/seq_to_file' +include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file' +include { ORIENT_REFERENCE } from '../modules/local/orient_reference' +include { CIGAR_PARSER } from '../modules/local/cigar_parser' +include { MERGING_SUMMARY } from '../modules/local/merging_summary' +include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' +include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' +include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' +include { DUMMY_FINAL_UMI } from '../modules/local/dummy_final_umi' +include { MODIFIED_MINIMAP2 as MINIMAP2_ALIGN_CONSENSUS_1 } from '../modules/local/modified_minimap2' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -81,8 +82,10 @@ include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1 } from '../modules/nf-co include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { MINIMAP2_INDEX } from '../modules/nf-core/minimap2/index/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' -include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_CONSENSUS_1 } from '../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_ALIGNMENT } from '../modules/nf-core/samtools/index/main' /* @@ -485,6 +488,39 @@ workflow CRISPRSEQ { RACON_2.out.improved_assembly ) + + // + // MODULE: Indexing the reference file + // + MINIMAP2_INDEX ( + RACON_2.out.improved_assembly + ) + + // + // MODULE: Alignment to obtain the consensus of an UMI cluster + // + MINIMAP2_ALIGN_CONSENSUS_1 { + ch_clusters_sequence + .join(MINIMAP2_INDEX.out.index), + true, + false, + true + } + + // + // MODULE: Obtain .bam.bai files + // + SAMTOOLS_INDEX_CONSENSUS_1 ( + ch_mapped_bam + ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_CONSENSUS_1.out.versions) + + + // + // MODULE: + // + + /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: @@ -574,10 +610,10 @@ workflow CRISPRSEQ { // // MODULE: Obtain .bam.bai files // - SAMTOOLS_INDEX ( + SAMTOOLS_INDEX_ALIGNMENT ( ch_mapped_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_ALIGNMENT.out.versions) // // MODULE: Obtain a new reference with the template modification @@ -613,7 +649,7 @@ workflow CRISPRSEQ { ch_versions = ch_versions.mix(MINIMAP2_ALIGN_TEMPLATE.out.versions) ch_mapped_bam - .join(SAMTOOLS_INDEX.out.bai) + .join(SAMTOOLS_INDEX_ALIGNMENT.out.bai) .join(ORIENT_REFERENCE.out.reference) .join(INPUT_CHECK.out.protospacer .map { From d883f1529f7900bc50d26ee0b95034745eacda62 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 10:48:33 +0200 Subject: [PATCH 23/45] remove mini_align as it's now performed by medaka_consensus --- conf/modules.config | 8 ----- modules/local/modified_minimap2.nf | 47 ----------------------------- workflows/crisprseq.nf | 48 +++++------------------------- 3 files changed, 7 insertions(+), 96 deletions(-) delete mode 100644 modules/local/modified_minimap2.nf diff --git a/conf/modules.config b/conf/modules.config index 9a9c9978..94c79519 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -145,14 +145,6 @@ process { ext.prefix = { "${reads.baseName}_cycle2" } } - WithName: MINIMAP2_INDEX { - ext.args = '-I 16G -x map-ont' - } - - withName: MINIMAP2_ALIGN_CONSENSUS_1 { - ext.args = '-x map-ont --secondary=no --MD' - } - withName: DUMMY_FINAL_UMI { publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, diff --git a/modules/local/modified_minimap2.nf b/modules/local/modified_minimap2.nf deleted file mode 100644 index 12e12374..00000000 --- a/modules/local/modified_minimap2.nf +++ /dev/null @@ -1,47 +0,0 @@ -process MODIFIED_MINIMAP2 { - tag "$meta.id" - label 'process_medium' - - // Note: the versions here need to match the versions used in the mulled container below and minimap2/index - conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : - 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" - - input: - tuple val(meta), path(reads), path(reference) - val bam_format - val cigar_paf_format - val cigar_bam - - output: - tuple val(meta), path("*.paf"), optional: true, emit: paf - tuple val(meta), path("*.bam"), optional: true, emit: bam - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def bam_output = bam_format ? "-a | samtools view -@ ${task.cpus} -T ${reference} -F 2308 -bS - | samtools sort -@ ${task.cpus} -o ${prefix}.bam" : "-o ${prefix}.paf" - def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' - def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' - """ - minimap2 \\ - $args \\ - -t $task.cpus \\ - "${reference ?: reads}" \\ - "$reads" \\ - $cigar_paf \\ - $set_cigar_bam \\ - $bam_output - - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - minimap2: \$(minimap2 --version 2>&1) - END_VERSIONS - """ -} diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index d9cdacf6..f4869d43 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -52,7 +52,6 @@ include { CLUSTERING_SUMMARY } from '../modules/loc include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' include { DUMMY_FINAL_UMI } from '../modules/local/dummy_final_umi' -include { MODIFIED_MINIMAP2 as MINIMAP2_ALIGN_CONSENSUS_1 } from '../modules/local/modified_minimap2' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -83,9 +82,9 @@ include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-co include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' include { MINIMAP2_INDEX } from '../modules/nf-core/minimap2/index/main' +include { MEDAKA as MEDAKA_1 } from '../modules/nf-core/medaka/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_CONSENSUS_1 } from '../modules/nf-core/samtools/index/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_ALIGNMENT } from '../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' /* @@ -482,44 +481,11 @@ workflow CRISPRSEQ { // - // MODULE: Convert reference .fa to .fai + // MODULE: Obtain a consensus sequence // - SAMTOOLS_FAIDX ( - RACON_2.out.improved_assembly - ) - - - // - // MODULE: Indexing the reference file - // - MINIMAP2_INDEX ( - RACON_2.out.improved_assembly - ) - - // - // MODULE: Alignment to obtain the consensus of an UMI cluster - // - MINIMAP2_ALIGN_CONSENSUS_1 { - ch_clusters_sequence - .join(MINIMAP2_INDEX.out.index), - true, - false, - true - } + MEDAKA_1 ( - // - // MODULE: Obtain .bam.bai files - // - SAMTOOLS_INDEX_CONSENSUS_1 ( - ch_mapped_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_CONSENSUS_1.out.versions) - - - // - // MODULE: - // - /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: @@ -610,10 +576,10 @@ workflow CRISPRSEQ { // // MODULE: Obtain .bam.bai files // - SAMTOOLS_INDEX_ALIGNMENT ( + SAMTOOLS_INDEX ( ch_mapped_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_ALIGNMENT.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions) // // MODULE: Obtain a new reference with the template modification @@ -649,7 +615,7 @@ workflow CRISPRSEQ { ch_versions = ch_versions.mix(MINIMAP2_ALIGN_TEMPLATE.out.versions) ch_mapped_bam - .join(SAMTOOLS_INDEX_ALIGNMENT.out.bai) + .join(SAMTOOLS_INDEX.out.bai) .join(ORIENT_REFERENCE.out.reference) .join(INPUT_CHECK.out.protospacer .map { From de43d353abbb9d8467c958ab8254a3e35500884b Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 11:12:52 +0200 Subject: [PATCH 24/45] add medaka process --- conf/modules.config | 5 +++++ nextflow.config | 1 + nextflow_schema.json | 6 ++++++ workflows/crisprseq.nf | 7 ++++--- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 94c79519..c9078b2e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -145,6 +145,11 @@ process { ext.prefix = { "${reads.baseName}_cycle2" } } + withName: MEDAKA { + ext.args = '-m r941_min_high_g303' + ext.prefix = { "${reads.baseName}_consensus" } + } + withName: DUMMY_FINAL_UMI { publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, diff --git a/nextflow.config b/nextflow.config index 4af7c0b1..cd61a1fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,7 @@ params { // UMI parameters umi_bin_size = 1 + medaka_model = 'r941_min_high_g303' // Vsearch options vsearch_minseqlength = 55 diff --git a/nextflow_schema.json b/nextflow_schema.json index 74b49103..260f215e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -53,6 +53,12 @@ "default": 1, "description": "Minimum size of a UMI cluster.", "fa_icon": "fas fa-sort-amount-down" + }, + "medaka_model": { + "type": "string", + "default": "r941_min_high_g360", + "fa_icon": "fas fa-font", + "description": "Medaka model (-m) to use according to the basecaller used." } }, "fa_icon": "fas fa-layer-group" diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index f4869d43..037f91a0 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -82,7 +82,7 @@ include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-co include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' include { MINIMAP2_INDEX } from '../modules/nf-core/minimap2/index/main' -include { MEDAKA as MEDAKA_1 } from '../modules/nf-core/medaka/main' +include { MEDAKA } from '../modules/nf-core/medaka/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' @@ -483,8 +483,9 @@ workflow CRISPRSEQ { // // MODULE: Obtain a consensus sequence // - MEDAKA_1 ( - + MEDAKA ( + ch_clusters_sequence + .join(RACON_2.out.improved_assembly) ) /* From 353a8ba92e58793cefb6b1306a12ff8feebbdef9 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 11:28:48 +0200 Subject: [PATCH 25/45] join all umi consensus in a file --- workflows/crisprseq.nf | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 037f91a0..b5b8a527 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -488,14 +488,30 @@ workflow CRISPRSEQ { .join(RACON_2.out.improved_assembly) ) + // Collect all consensus UMI sequences into one single file per sample + MEDAKA.out.assembly + .tap{ meta_channel_4 } + .collectFile() { meta, file -> + [ "${meta.id}_consensus.fasta", file ] + } + .join(meta_channel_4 + .map{ meta, consensus -> + ["${consensus.baseName}", meta] + } + ) + .map{ name, file, meta -> + [meta - meta.subMap('cluster_id'), file] + } + .set{ ch_umi_consensus } + + ch_umi_consensus.view() + /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: Modules to implement: - consensus - join_reads fa2fq */ From ac9f2674e4f3adc979c9c2e1ac19026ccfdd4da2 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 13:27:46 +0200 Subject: [PATCH 26/45] convert fasta to fastq after umi consensus --- conf/modules.config | 4 ++++ nextflow.config | 1 + workflows/crisprseq.nf | 28 ++++++++++++++++++---------- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c9078b2e..3b461fae 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -150,6 +150,10 @@ process { ext.prefix = { "${reads.baseName}_consensus" } } + withName: SEQTK_SEQ_FATOFQ { + ext.args = '-F "#"' + } + withName: DUMMY_FINAL_UMI { publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, diff --git a/nextflow.config b/nextflow.config index cd61a1fa..e1bdf5d4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -108,6 +108,7 @@ profiles { docker { docker.enabled = true docker.userEmulation = true + docker.registry = 'quay.io' singularity.enabled = false podman.enabled = false shifter.enabled = false diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index b5b8a527..a874058e 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -67,7 +67,8 @@ include { MULTIQC } from '../modules/nf-co include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { PEAR } from '../modules/nf-core/pear/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' -include { SEQTK_SEQ } from '../modules/nf-core/seqtk/seq/main' +include { SEQTK_SEQ as SEQTK_SEQ_MASK } from '../modules/nf-core/seqtk/seq/main' +include { SEQTK_SEQ as SEQTK_SEQ_FATOFQ } from '../modules/nf-core/seqtk/seq/main' include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' include { VSEARCH_SORT } from '../modules/nf-core/vsearch/sort/main' include { RACON as RACON_1 } from '../modules/nf-core/racon/main' @@ -288,10 +289,10 @@ workflow CRISPRSEQ { // // MODULE: Mask (convert to Ns) bases with quality lower than 20 and remove sequences shorter than 80 // - SEQTK_SEQ ( + SEQTK_SEQ_MASK ( ch_trimmed ) - ch_versions = ch_versions.mix(SEQTK_SEQ.out.versions) + ch_versions = ch_versions.mix(SEQTK_SEQ_MASK.out.versions) // @@ -301,7 +302,7 @@ workflow CRISPRSEQ { ch_cat_fastq.paired .mix(ch_cat_fastq.single) .join(PEAR.out.assembled, remainder: true) - .join(SEQTK_SEQ.out.fastx) + .join(SEQTK_SEQ_MASK.out.fastx) .join(CUTADAPT.out.log) .map { meta, reads, assembled, masked, trimmed -> if (assembled == null) { @@ -318,7 +319,7 @@ workflow CRISPRSEQ { // MODULE: Extract UMI sequences // EXTRACT_UMIS ( - SEQTK_SEQ.out.fastx + SEQTK_SEQ_MASK.out.fastx ) @@ -506,6 +507,13 @@ workflow CRISPRSEQ { ch_umi_consensus.view() + // + // MODULE: Convert fasta to fastq + // + SEQTK_SEQ_FATOFQ ( + ch_umi_consensus + ) + /* The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: @@ -518,14 +526,14 @@ workflow CRISPRSEQ { // Dummy process simulating the last UMi clustering step to obtain clusters as fastq DUMMY_FINAL_UMI { - SEQTK_SEQ.out.fastx + SEQTK_SEQ_MASK.out.fastx } // // MODULE: Summary of clustered reads // CLUSTERING_SUMMARY ( - SEQTK_SEQ.out.fastx + SEQTK_SEQ_MASK.out.fastx .join(MERGING_SUMMARY.out.summary) ) @@ -535,7 +543,7 @@ workflow CRISPRSEQ { // if (params.aligner == "minimap2") { MINIMAP2_ALIGN ( - SEQTK_SEQ.out.fastx + SEQTK_SEQ_MASK.out.fastx .join(ORIENT_REFERENCE.out.reference), true, false, @@ -554,7 +562,7 @@ workflow CRISPRSEQ { ) ch_versions = ch_versions.mix(BWA_INDEX.out.versions) BWA_MEM ( - SEQTK_SEQ.out.fastx, + SEQTK_SEQ_MASK.out.fastx, BWA_INDEX.out.index, true ) @@ -571,7 +579,7 @@ workflow CRISPRSEQ { ) ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions) BOWTIE2_ALIGN ( - SEQTK_SEQ.out.fastx, + SEQTK_SEQ_MASK.out.fastx, BOWTIE2_BUILD.out.index, false, true From be105b4315ea6ac1af3b2c6ffbec7450cafb7a68 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 13:29:17 +0200 Subject: [PATCH 27/45] remove dummi_final_umi process --- modules/local/dummy_final_umi.nf | 55 -------------------------------- workflows/crisprseq.nf | 15 --------- 2 files changed, 70 deletions(-) delete mode 100644 modules/local/dummy_final_umi.nf diff --git a/modules/local/dummy_final_umi.nf b/modules/local/dummy_final_umi.nf deleted file mode 100644 index 64069410..00000000 --- a/modules/local/dummy_final_umi.nf +++ /dev/null @@ -1,55 +0,0 @@ -process DUMMY_FINAL_UMI { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::p7zip==16.02" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : - 'quay.io/biocontainers/p7zip:16.02' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*_clustered.fastq.gz"), emit: dummy - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}_clustered.fastq - - 7za \\ - a \\ - $args \\ - ${prefix}_clustered.fastq.gz \\ - ${prefix}_clustered.fastq - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') - END_VERSIONS - """ -} - - - -process summary_reads{ - - input: - set sampleID, file(summary), file(finalReads) from summaryReport.join(readsSummary) - - output: - set val(sampleID), file(summary) into summaryReportReads - - script: - """ - final_count=`expr \$(cat $finalReads | wc -l) / 4` - echo "clustered-reads," \$final_count >> ${sampleID}_summary.csv - """ - - } diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index a874058e..94c3505e 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -51,7 +51,6 @@ include { MERGING_SUMMARY } from '../modules/loc include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' -include { DUMMY_FINAL_UMI } from '../modules/local/dummy_final_umi' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -514,20 +513,6 @@ workflow CRISPRSEQ { ch_umi_consensus ) - /* - The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: - - - Modules to implement: - - fa2fq - - */ - - // Dummy process simulating the last UMi clustering step to obtain clusters as fastq - DUMMY_FINAL_UMI { - SEQTK_SEQ_MASK.out.fastx - } // // MODULE: Summary of clustered reads From 798b4dbdad68cbc9ecd0e823e4215f56ba4653aa Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 13:34:13 +0200 Subject: [PATCH 28/45] pass final umi consensus tu clustering_summary --- workflows/crisprseq.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 94c3505e..7a603bee 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -518,7 +518,7 @@ workflow CRISPRSEQ { // MODULE: Summary of clustered reads // CLUSTERING_SUMMARY ( - SEQTK_SEQ_MASK.out.fastx + SEQTK_SEQ_FATOFQ.out.fastx .join(MERGING_SUMMARY.out.summary) ) From 8d10395f3c16090de7852cddee4f41f215e444b3 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Tue, 9 May 2023 13:35:03 +0200 Subject: [PATCH 29/45] pass final umi consensus to mapping steps --- workflows/crisprseq.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 7a603bee..c59be2f0 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -528,7 +528,7 @@ workflow CRISPRSEQ { // if (params.aligner == "minimap2") { MINIMAP2_ALIGN ( - SEQTK_SEQ_MASK.out.fastx + SEQTK_SEQ_FATOFQ.out.fastx .join(ORIENT_REFERENCE.out.reference), true, false, @@ -547,7 +547,7 @@ workflow CRISPRSEQ { ) ch_versions = ch_versions.mix(BWA_INDEX.out.versions) BWA_MEM ( - SEQTK_SEQ_MASK.out.fastx, + SEQTK_SEQ_FATOFQ.out.fastx, BWA_INDEX.out.index, true ) @@ -564,7 +564,7 @@ workflow CRISPRSEQ { ) ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions) BOWTIE2_ALIGN ( - SEQTK_SEQ_MASK.out.fastx, + SEQTK_SEQ_FATOFQ.out.fastx, BOWTIE2_BUILD.out.index, false, true From 42302dd92307af0279876f25fcff39041ea2a23f Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Thu, 11 May 2023 14:21:07 +0200 Subject: [PATCH 30/45] unzip input medaka --- conf/modules.config | 3 --- modules.json | 3 ++- modules/nf-core/medaka/main.nf | 3 +++ modules/nf-core/medaka/medaka.diff | 15 +++++++++++++++ 4 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 modules/nf-core/medaka/medaka.diff diff --git a/conf/modules.config b/conf/modules.config index 3b461fae..478a8e39 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -152,9 +152,6 @@ process { withName: SEQTK_SEQ_FATOFQ { ext.args = '-F "#"' - } - - withName: DUMMY_FINAL_UMI { publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, mode: params.publish_dir_mode, diff --git a/modules.json b/modules.json index 456ae16b..0b50ffed 100644 --- a/modules.json +++ b/modules.json @@ -49,7 +49,8 @@ "medaka": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/medaka/medaka.diff" }, "minimap2/align": { "branch": "master", diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf index d60915ee..a4eaa998 100644 --- a/modules/nf-core/medaka/main.nf +++ b/modules/nf-core/medaka/main.nf @@ -21,6 +21,9 @@ process MEDAKA { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ + gzip -cdf $reads > ${reads.baseName} + gzip -cdf $assembly > ${assembly.baseName} + medaka_consensus \\ -t $task.cpus \\ $args \\ diff --git a/modules/nf-core/medaka/medaka.diff b/modules/nf-core/medaka/medaka.diff new file mode 100644 index 00000000..1c9d16f9 --- /dev/null +++ b/modules/nf-core/medaka/medaka.diff @@ -0,0 +1,15 @@ +Changes in module 'nf-core/medaka' +--- modules/nf-core/medaka/main.nf ++++ modules/nf-core/medaka/main.nf +@@ -21,6 +21,9 @@ + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ ++ gzip -cdf $reads > ${reads.baseName} ++ gzip -cdf $assembly > ${assembly.baseName} ++ + medaka_consensus \\ + -t $task.cpus \\ + $args \\ + +************************************************************ From a9969062e3bc61ad6071707094d376f3ae79810e Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 15 May 2023 13:12:02 +0200 Subject: [PATCH 31/45] make removing overrepresented sequences optional --- nextflow.config | 3 ++ nextflow_schema.json | 17 ++++++++++ workflows/crisprseq.nf | 72 +++++++++++++++++++++++------------------- 3 files changed, 60 insertions(+), 32 deletions(-) diff --git a/nextflow.config b/nextflow.config index e1bdf5d4..db3154b2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,9 @@ params { input = null aligner = 'minimap2' + // Pipeline steps + overrepresented = false + // UMI parameters umi_bin_size = 1 medaka_model = 'r941_min_high_g303' diff --git a/nextflow_schema.json b/nextflow_schema.json index 260f215e..a0e469ea 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,6 +5,20 @@ "description": "Pipeline for the analysis of crispr data", "type": "object", "definitions": { + "popeline_steps": { + "title": "Popeline steps", + "type": "object", + "description": "Alternative pipeline steps to include in the analysis.", + "default": "", + "properties": { + "overrepresented": { + "type": "boolean", + "fa_icon": "fas fa-sort-numeric-up-alt", + "description": "Trim overrepresented sequences from reads (cutadapt)" + } + }, + "fa_icon": "fas fa-shoe-prints" + }, "input_output_options": { "title": "Input/output options", "type": "object", @@ -340,6 +354,9 @@ } }, "allOf": [ + { + "$ref": "#/definitions/popeline_steps" + }, { "$ref": "#/definitions/input_output_options" }, diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index c59be2f0..3a3a1baf 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -249,41 +249,49 @@ workflow CRISPRSEQ { ) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - // - // MODULE: Find overrepresented sequences - FIND_ADAPTERS ( - FASTQC.out.zip - ) - .adapters - .join(ch_pear_fastq) - .groupTuple(by: [0]) - // Separate samples by containing overrepresented sequences or not - .branch { - meta, adapter_seqs, reads -> - no_adapters: adapter_seqs[0].size() == 0 - return [ meta, reads[0] ] - adapters : adapter_seqs[0].size() > 0 - return [ meta, reads[0], adapter_seqs[0] ] - } - .set { ch_adapter_seqs } + ch_trimmed = Channel.empty() + if (params.overrepresented) { + // + // MODULE: Find overrepresented sequences + FIND_ADAPTERS ( + FASTQC.out.zip + ) + .adapters + .join(ch_pear_fastq) + .groupTuple(by: [0]) + // Separate samples by containing overrepresented sequences or not + .branch { + meta, adapter_seqs, reads -> + no_adapters: adapter_seqs[0].size() == 0 + return [ meta, reads[0] ] + adapters : adapter_seqs[0].size() > 0 + return [ meta, reads[0], adapter_seqs[0] ] + } + .set { ch_adapter_seqs } - // - // MODULE: Trim adapter sequences - // - CUTADAPT ( - ch_adapter_seqs.adapters - ) - ch_versions = ch_versions.mix(CUTADAPT.out.versions) - ch_adapter_seqs.no_adapters - .mix(CUTADAPT.out.reads) - .groupTuple(by: [0]) - .map { - meta, fastq -> - return [ meta, fastq.flatten() ] + // + // MODULE: Trim adapter sequences + // + CUTADAPT ( + ch_adapter_seqs.adapters + ) + ch_versions = ch_versions.mix(CUTADAPT.out.versions) + + ch_adapter_seqs.no_adapters + .mix(CUTADAPT.out.reads) + .groupTuple(by: [0]) + .map { + meta, fastq -> + return [ meta, fastq.flatten() ] + } + .set{ ch_trimmed } + } else { + ch_trimmed = ch_pear_fastq } - .set{ ch_trimmed } + + ch_trimmed.view() // // MODULE: Mask (convert to Ns) bases with quality lower than 20 and remove sequences shorter than 80 @@ -302,7 +310,7 @@ workflow CRISPRSEQ { .mix(ch_cat_fastq.single) .join(PEAR.out.assembled, remainder: true) .join(SEQTK_SEQ_MASK.out.fastx) - .join(CUTADAPT.out.log) + .join(( params.overrepresented ? CUTADAPT.out.log : Channel.value("null") )) .map { meta, reads, assembled, masked, trimmed -> if (assembled == null) { dummy = file('null') From 2706b7aab4f0cbdee54ae75f9e6c2fe227534f74 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Mon, 15 May 2023 14:39:35 +0200 Subject: [PATCH 32/45] unzip medaka input --- modules/nf-core/medaka/main.nf | 26 +++++++++++++------ modules/nf-core/medaka/medaka.diff | 41 +++++++++++++++++++++++++++--- workflows/crisprseq.nf | 36 +++++++++++++------------- 3 files changed, 74 insertions(+), 29 deletions(-) diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf index a4eaa998..b02119b7 100644 --- a/modules/nf-core/medaka/main.nf +++ b/modules/nf-core/medaka/main.nf @@ -11,8 +11,8 @@ process MEDAKA { tuple val(meta), path(reads), path(assembly) output: - tuple val(meta), path("*.fa.gz"), emit: assembly - path "versions.yml" , emit: versions + tuple val(meta), path("*.fasta.gz"), emit: assembly + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -21,19 +21,29 @@ process MEDAKA { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - gzip -cdf $reads > ${reads.baseName} - gzip -cdf $assembly > ${assembly.baseName} + if [[ "${reads.extension}" == "gz" ]]; then + gzip -df $reads + reads=$reads.baseName + else + reads=$reads + fi + if [[ "${assembly.extension}" == "gz" ]]; then + gzip -df $assembly + assembly=$assembly.baseName + else + assembly=$assembly.baseName + fi medaka_consensus \\ -t $task.cpus \\ $args \\ - -i $reads \\ - -d $assembly \\ + -i \$reads \\ + -d \$assembly \\ -o ./ - mv consensus.fasta ${prefix}.fa + mv consensus.fasta ${prefix}.fasta - gzip -n ${prefix}.fa + gzip -n ${prefix}.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/medaka/medaka.diff b/modules/nf-core/medaka/medaka.diff index 1c9d16f9..d0b2fb34 100644 --- a/modules/nf-core/medaka/medaka.diff +++ b/modules/nf-core/medaka/medaka.diff @@ -1,15 +1,50 @@ Changes in module 'nf-core/medaka' --- modules/nf-core/medaka/main.nf +++ modules/nf-core/medaka/main.nf -@@ -21,6 +21,9 @@ +@@ -11,8 +11,8 @@ + tuple val(meta), path(reads), path(assembly) + + output: +- tuple val(meta), path("*.fa.gz"), emit: assembly +- path "versions.yml" , emit: versions ++ tuple val(meta), path("*.fasta.gz"), emit: assembly ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when +@@ -21,16 +21,29 @@ def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ -+ gzip -cdf $reads > ${reads.baseName} -+ gzip -cdf $assembly > ${assembly.baseName} ++ if [[ "${reads.extension}" == "gz" ]]; then ++ gzip -df $reads ++ reads=$reads.baseName ++ else ++ reads=$reads ++ fi ++ if [[ "${assembly.extension}" == "gz" ]]; then ++ gzip -df $assembly ++ assembly=$assembly.baseName ++ else ++ assembly=$assembly.baseName ++ fi + medaka_consensus \\ -t $task.cpus \\ $args \\ +- -i $reads \\ +- -d $assembly \\ ++ -i \$reads \\ ++ -d \$assembly \\ + -o ./ + +- mv consensus.fasta ${prefix}.fa ++ mv consensus.fasta ${prefix}.fasta + +- gzip -n ${prefix}.fa ++ gzip -n ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": ************************************************************ diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 3a3a1baf..ca2fd24c 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -291,7 +291,6 @@ workflow CRISPRSEQ { ch_trimmed = ch_pear_fastq } - ch_trimmed.view() // // MODULE: Mask (convert to Ns) bases with quality lower than 20 and remove sequences shorter than 80 @@ -389,49 +388,50 @@ workflow CRISPRSEQ { // Get the correspondent fasta sequencences from top cluster sequences // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps - VSEARCH_SORT.out.fasta - .tap{ meta_channel_2 } + VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta] + .tap{ meta_channel_2 } // [[id:sample_id, ...], sample_top.fasta] .map{ meta, fasta -> fasta_line = umi_to_sequence_centroid(fasta) - [meta, fasta.baseName, fasta_line] + [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...] } .collectFile() { meta, name, fasta -> - [ "${name}.fasta", fasta ] + [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta } .map{ new_file -> - [new_file.baseName[0..-5], new_file] // Substring is removing "_top" added by VSEARCH_SORT + [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta] } .join(meta_channel_2 .map { meta, original_file -> - ["${original_file.baseName}"[0..-5], meta] // Substring is removing "_top" added by VSEARCH_SORT - }) + ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]] + }) // [sample, sample_top.fasta, [id:sample_id, ...]] .map{ file_name, new_file, meta -> - [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map + [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] } - .set{ ch_top_clusters_sequence } + .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] // Get the correspondent fasta sequencences from UMI clusters - ch_umi_bysize.cluster - .tap{ meta_channel_3 } + ch_umi_bysize.cluster // [[id:sample_id, ...], sample] + .tap{ meta_channel_3 } // [[id:sample_id, ...], sample] .map{ meta, cluster -> fasta_line = umi_to_sequence(cluster) - [meta, cluster.baseName, fasta_line] + [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...] } .collectFile() { meta, name, fasta -> - [ "${name}_sequences.fasta", fasta ] + [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta } .map{ new_file -> - [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile + [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta] } .join(meta_channel_3 .map { meta, original_file -> - ["${original_file.baseName}", meta] - }) + ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]] + }) // [sample, sample_sequences.fasta, [id:sample_id, ...]] .map{ file_name, new_file, meta -> - [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map + [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_sequences.fasta] } .set{ ch_clusters_sequence } + // Cluster consensus & polishing // Two cycles of minimap2 + racon From c0974bf45a81955744935dc7979ab7a61aa06b5e Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Mon, 15 May 2023 15:45:57 +0200 Subject: [PATCH 33/45] fix medaka error --- conf/modules.config | 2 +- modules/nf-core/medaka/main.nf | 8 +++----- modules/nf-core/medaka/medaka.diff | 13 ++++++------- workflows/crisprseq.nf | 10 ++++++++-- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 478a8e39..fe53493a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -147,7 +147,7 @@ process { withName: MEDAKA { ext.args = '-m r941_min_high_g303' - ext.prefix = { "${reads.baseName}_consensus" } + ext.prefix = { "${reads.baseName}_medakaConsensus" } } withName: SEQTK_SEQ_FATOFQ { diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf index b02119b7..80e4c065 100644 --- a/modules/nf-core/medaka/main.nf +++ b/modules/nf-core/medaka/main.nf @@ -11,8 +11,8 @@ process MEDAKA { tuple val(meta), path(reads), path(assembly) output: - tuple val(meta), path("*.fasta.gz"), emit: assembly - path "versions.yml" , emit: versions + tuple val(meta), path("*_medakaConsensus.fasta"), emit: assembly + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -31,7 +31,7 @@ process MEDAKA { gzip -df $assembly assembly=$assembly.baseName else - assembly=$assembly.baseName + assembly=$assembly fi medaka_consensus \\ @@ -43,8 +43,6 @@ process MEDAKA { mv consensus.fasta ${prefix}.fasta - gzip -n ${prefix}.fasta - cat <<-END_VERSIONS > versions.yml "${task.process}": medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' ) diff --git a/modules/nf-core/medaka/medaka.diff b/modules/nf-core/medaka/medaka.diff index d0b2fb34..7f8752ac 100644 --- a/modules/nf-core/medaka/medaka.diff +++ b/modules/nf-core/medaka/medaka.diff @@ -7,12 +7,12 @@ Changes in module 'nf-core/medaka' output: - tuple val(meta), path("*.fa.gz"), emit: assembly - path "versions.yml" , emit: versions -+ tuple val(meta), path("*.fasta.gz"), emit: assembly -+ path "versions.yml" , emit: versions ++ tuple val(meta), path("*_medakaConsensus.fasta"), emit: assembly ++ path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when -@@ -21,16 +21,29 @@ +@@ -21,16 +21,27 @@ def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ @@ -26,7 +26,7 @@ Changes in module 'nf-core/medaka' + gzip -df $assembly + assembly=$assembly.baseName + else -+ assembly=$assembly.baseName ++ assembly=$assembly + fi + medaka_consensus \\ @@ -39,10 +39,9 @@ Changes in module 'nf-core/medaka' -o ./ - mv consensus.fasta ${prefix}.fa -+ mv consensus.fasta ${prefix}.fasta - +- - gzip -n ${prefix}.fa -+ gzip -n ${prefix}.fasta ++ mv consensus.fasta ${prefix}.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index ca2fd24c..0c64cc95 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -499,12 +499,19 @@ workflow CRISPRSEQ { // Collect all consensus UMI sequences into one single file per sample MEDAKA.out.assembly .tap{ meta_channel_4 } + .map{ meta, file -> + file_content = file.getText() + [meta, file_content] // [[id:sample_id, ...], consensus_content] + } .collectFile() { meta, file -> [ "${meta.id}_consensus.fasta", file ] } + .map{ new_file -> + [new_file.baseName, new_file] + } .join(meta_channel_4 .map{ meta, consensus -> - ["${consensus.baseName}", meta] + ["${meta.id}_consensus", meta] } ) .map{ name, file, meta -> @@ -512,7 +519,6 @@ workflow CRISPRSEQ { } .set{ ch_umi_consensus } - ch_umi_consensus.view() // // MODULE: Convert fasta to fastq From 565e6e89a5c1b91895d516ccfedf94e6fdb8ab41 Mon Sep 17 00:00:00 2001 From: Julia Mir Pedrol Date: Mon, 22 May 2023 17:38:18 +0200 Subject: [PATCH 34/45] hopefully fix umis bug --- workflows/crisprseq.nf | 52 +++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/workflows/crisprseq.nf b/workflows/crisprseq.nf index 0c64cc95..ca4f15bb 100644 --- a/workflows/crisprseq.nf +++ b/workflows/crisprseq.nf @@ -93,15 +93,17 @@ include { SAMTOOLS_INDEX } from '../modules/nf-co ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def umi_to_sequence(cluster) { - cluster.withReader { source -> - String line - sequences = "" - while ( line=source.readLine() ) { - if (line.startsWith(">")) { - sequence = (line =~ /;seq=(.*$)/)[0][1] - id = (line =~ /(>.*?);/)[0][1] - sequences = sequences + id + "\n" + sequence + "\n" +def umi_to_sequence(cluster1) { + String line1 + String sequences = "" + String sequence1 + String id1 + cluster1.withReader { + while ( line1=it.readLine() ) { + if (line1.startsWith(">")) { + sequence1 = (line1 =~ /;seq=(.*$)/)[0][1] + id1 = (line1 =~ /(>.*?);/)[0][1] + sequences = sequences + id1 + "\n" + sequence1 + "\n" } } } @@ -109,17 +111,18 @@ def umi_to_sequence(cluster) { } def umi_to_sequence_centroid(cluster) { - cluster.withReader { source -> - String line - while ( line=source.readLine() ) { + String line + String sequence + String id + cluster.withReader { + while ( line=it.readLine() ) { if (line.startsWith(">")) { sequence = (line =~ /;seq=(.*$)/)[0][1] id = (line =~ /(>.*?);/)[0][1] + return id.replace(">", ">centroid_") + "\n" + sequence } } } - id = id.replace(">", ">centroid_") - return id + "\n" + sequence } @@ -386,6 +389,9 @@ workflow CRISPRSEQ { Channel.value("--sortbysize") ) + //VSEARCH_SORT.out.fasta.dump(tag:'vsearch_sort_output') + //ch_umi_bysize.cluster.dump(tag:'umi_clusters') + // Get the correspondent fasta sequencences from top cluster sequences // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta] @@ -394,9 +400,10 @@ workflow CRISPRSEQ { fasta_line = umi_to_sequence_centroid(fasta) [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...] } - .collectFile() { meta, name, fasta -> + .collectFile(cache:true,sort:true) { meta, name, fasta -> [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta } + .dump(tag:"collectfile output") .map{ new_file -> [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta] } @@ -408,6 +415,7 @@ workflow CRISPRSEQ { [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] } .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] + // Get the correspondent fasta sequencences from UMI clusters ch_umi_bysize.cluster // [[id:sample_id, ...], sample] @@ -416,9 +424,11 @@ workflow CRISPRSEQ { fasta_line = umi_to_sequence(cluster) [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...] } - .collectFile() { meta, name, fasta -> + .dump(tag:'map allreads output') + .collectFile(cache:true,sort:true) { meta, name, fasta -> [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta } + .dump(tag:"collectfile allreads output") .map{ new_file -> [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta] } @@ -447,11 +457,21 @@ workflow CRISPRSEQ { false ) + MINIMAP2_ALIGN_UMI_1.out.paf.dump(tag:'minimap_unfiltered') + // Only continue with clusters that have aligned sequences MINIMAP2_ALIGN_UMI_1.out.paf .filter{ it[1].countLines() > 0 } .set{ ch_minimap_1 } + //ch_clusters_sequence.dump(tag:'ch_umi_clusters') + //ch_top_clusters_sequence.dump(tag:'ch_vsearch_top') + ch_minimap_1.dump(tag:'minimap') + //ch_clusters_sequence + // .join(ch_top_clusters_sequence) + // .join(ch_minimap_1) + // .dump(tag:'clusters_top_alignment') + // // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1 // From 6c57e3eb69d8978f1ad62eefd3b1f72dd69555b3 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 12 Jun 2023 09:09:24 +0200 Subject: [PATCH 35/45] fix typo --- nextflow_schema.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 6a8265bb..a56a38f2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,8 +5,8 @@ "description": "Pipeline for the analysis of crispr data", "type": "object", "definitions": { - "popeline_steps": { - "title": "Popeline steps", + "pipeline_steps": { + "title": "pipeline steps", "type": "object", "description": "Alternative pipeline steps to include in the analysis.", "default": "", @@ -401,7 +401,7 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/popeline_steps" + "$ref": "#/definitions/pipeline_steps" }, { "$ref": "#/definitions/screening_parameters" From 6dfba880712f8b87687cfc0eac8550669132f3a5 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 12 Jun 2023 09:12:25 +0200 Subject: [PATCH 36/45] update schema --- nextflow_schema.json | 51 +++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index a56a38f2..27b9911c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,20 +5,6 @@ "description": "Pipeline for the analysis of crispr data", "type": "object", "definitions": { - "pipeline_steps": { - "title": "pipeline steps", - "type": "object", - "description": "Alternative pipeline steps to include in the analysis.", - "default": "", - "properties": { - "overrepresented": { - "type": "boolean", - "fa_icon": "fas fa-sort-numeric-up-alt", - "description": "Trim overrepresented sequences from reads (cutadapt)" - } - }, - "fa_icon": "fas fa-shoe-prints" - }, "input_output_options": { "title": "Input/output options", "type": "object", @@ -45,7 +31,8 @@ "analysis": { "type": "string", "fa_icon": "fas fa-cog", - "enum": ["screening", "targeted"] + "enum": ["screening", "targeted"], + "description": "Type of analysis to perform" }, "email": { "type": "string", @@ -61,6 +48,20 @@ } } }, + "targeted_pipeline_steps": { + "title": "targeted pipeline steps", + "type": "object", + "description": "Alternative pipeline steps to include in the targeted analysis.", + "default": "", + "properties": { + "overrepresented": { + "type": "boolean", + "fa_icon": "fas fa-sort-numeric-up-alt", + "description": "Trim overrepresented sequences from reads (cutadapt)" + } + }, + "fa_icon": "fas fa-shoe-prints" + }, "umi_parameters": { "title": "UMI parameters", "type": "object", @@ -162,12 +163,12 @@ "min_reads": { "type": "number", "description": "a filter threshold value for sgRNAs, based on their average counts in the control sample", - "default": 30.0 + "default": 30 }, "min_targeted_genes": { "type": "number", "description": "Minimal number of different genes targeted by sgRNAs in a biased segment in order for the corresponding counts to be corrected", - "default": 3.0 + "default": 3 } } }, @@ -401,10 +402,7 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/pipeline_steps" - }, - { - "$ref": "#/definitions/screening_parameters" + "$ref": "#/definitions/targeted_pipeline_steps" }, { "$ref": "#/definitions/umi_parameters" @@ -415,6 +413,9 @@ { "$ref": "#/definitions/vsearch_parameters" }, + { + "$ref": "#/definitions/screening_parameters" + }, { "$ref": "#/definitions/reference_genome_options" }, @@ -427,5 +428,11 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "schema_ignore_params": { + "type": "string", + "default": "genomes" + } + } } From 4ed1882857aa389b187e1547b7e03921dc9e3925 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 12 Jun 2023 09:14:48 +0200 Subject: [PATCH 37/45] add analysis param in test_umis --- conf/test_umis.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/test_umis.config b/conf/test_umis.config index 42eab8f8..cfa11b55 100644 --- a/conf/test_umis.config +++ b/conf/test_umis.config @@ -20,7 +20,8 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv' + analysis = 'targeted' // Aligner aligner = 'minimap2' From 0482c066d1314ebbebaede0f5ff6d3c588a2c4d8 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Mon, 12 Jun 2023 14:00:31 +0200 Subject: [PATCH 38/45] add docs for UMIs --- README.md | 18 +++++++-- conf/modules.config | 10 +++++ docs/output_targeted.md | 69 +++++++++++++++++++++++++++++---- docs/usage_targeted.md | 4 +- workflows/crisprseq_targeted.nf | 11 ------ 5 files changed, 87 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index ae00a3f4..8c46e6c6 100644 --- a/README.md +++ b/README.md @@ -32,11 +32,19 @@ For crispr targeted : 2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 3. Adapter trimming ([`Cutadapt`](http://dx.doi.org/10.14806/ej.17.1.200)) 4. Quality filtering ([`Seqtk`](https://github.com/lh3/seqtk)) -5. Read mapping: +5. UMI clustering (optional): + 1. Extract UMI sequences (Python script) + 2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch)) + 3. Obtain the most abundant UMI sequence for each cluster ([`Vsearch`](https://github.com/torognes/vsearch)) + 4. Obtain a consensus for each cluster ([`minimap2`](https://github.com/lh3/minimap2)) + 5. Polish consensus sequence ([`racon`](https://github.com/lbcb-sci/racon)) + 6. Repeat a second rand of consensus + polishing (`minimap2` + `racon`) + 7. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html)) +6. Read mapping: - ([`minimap2`](https://github.com/lh3/minimap2), _default_) - ([`bwa`](http://bio-bwa.sourceforge.net/)) - ([`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) -6. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/)) +7. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/)) For crispr screening : @@ -56,7 +64,9 @@ For crispr screening : 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash - nextflow run nf-core/crisprseq -profile test,YOURPROFILE --outdir + nextflow run nf-core/crisprseq -profile test_screening,YOURPROFILE --outdir + # or + nextflow run nf-core/crisprseq -profile test_targeted,YOURPROFILE --outdir ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. @@ -69,7 +79,7 @@ For crispr screening : 4. Start running your own analysis! ```bash - nextflow run nf-core/crisprseq --input samplesheet.csv --outdir -profile + nextflow run nf-core/crisprseq --input samplesheet.csv --analysis --outdir -profile ``` ## Documentation diff --git a/conf/modules.config b/conf/modules.config index e28805c2..50a27953 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -162,11 +162,21 @@ process { withName: MINIMAP2_ALIGN_UMI_1 { ext.args = '-x map-ont' ext.prefix = { "${reads.baseName}_cycle1" } + publishDir = [ + path: { "${params.outdir}/minimap2_umi" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } withName: MINIMAP2_ALIGN_UMI_2 { ext.args = '-x map-ont' ext.prefix = { "${reads.baseName}_cycle2" } + publishDir = [ + path: { "${params.outdir}/minimap2_umi" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } withName: RACON_1 { diff --git a/docs/output_targeted.md b/docs/output_targeted.md index 1cb835c1..c84aa887 100644 --- a/docs/output_targeted.md +++ b/docs/output_targeted.md @@ -11,14 +11,18 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [Preprocessing](#preprocessing) - - [Sequences](#sequences) - Input sequence preparation (reference, protospacer, template) + - [sequences](#sequences) - Input sequence preparation (reference, protospacer, template) - [cat](#cat) - Concatenate sample fastq files if required - - [Pear](#pear) - Join double-end reads if required - - [FastQC](#fastqc) - Read Quality Control - - [Adapters](#adapters) - Find adapters (Overrepresented sequences) in reads - - [Cutadapt](#cutadapt) - Trim adapters - - [Seqtk](#seqtk) - Mask low-quality bases - + - [pear](#pear) - Join double-end reads if required + - [fastqc](#fastqc) - Read Quality Control + - [adapters](#adapters) - Find adapters (Overrepresented sequences) in reads + - [cutadapt](#cutadapt) - Trim adapters + - [seqtk](#seqtk) - Mask low-quality bases +- [UMI clustering](#umi-clustering) + - [vsearch](#vsearch) + - [minimap2 umi](#minimap2-umi) + - [racon](#racon) + - [medaka](#medaka) - [Mapping](#mapping) - [minimap2](#minimap2) - Mapping reads to reference - [BWA](#bwa) - Mapping reads to reference @@ -133,7 +137,56 @@ If multiple libraries/runs have been provided for the same sample in the input s [Seqtk](https://github.com/lh3/seqtk) masks (converts to Ns) bases with quality lower than 20 and removes sequences shorter than 80 bases. - +## UMI clustering + +### vsearch + +
+Output files + +- `vsearch/` + - `*_clusters*`: Contains all UMI sequences which clustered together. + - `*_clusters*_top.fasta`: Contains the most abundant UMI sequence from the cluster. + +
+ +[VSEARCH](https://github.com/torognes/vsearch) is a versatile open-source tool which includes chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. `vsearch/clsuter` can cluster sequences using a single-pass, greedy centroid-based clustering algorithm. `vsearch/sort` can sort fasta entries by decreasing abundance (`--sortbysize`) or sequence length (`--sortbylength`). + +### minimap2_umi + +
+Output files + +- `minimap2_umi/` + - `*_sequences_clycle[1,2].paf`: Alignment of the cluster sequences against the top UMi sequence in paf format. + +
+ +[Minimap2](https://github.com/lh3/minimap2) is a sequence alignment program that aligns DNA sequences against a reference database. + +### racon + +
+Output files + +- `racon/` + - `*_sequences_clycle[1,2]_assembly_consensus.fasta.gz`: Consensus sequence obtained from the cluster multiple sequence alignment. + +
+ +[Racon](https://github.com/lbcb-sci/racon) is an ultrafast consensus module for raw de novo genome assembly of long uncorrected reads. + +### medaka + +
+Output files + +- `medaka/` + - `*_medakaConsensus.fasta`: Final consensus sequence of each UMI cluster. Obtained after two rounds of minimap2 + racon. + +
+ +[Medaka](https://nanoporetech.github.io/medaka/index.html) is a tool to create consensus sequences and variant calls from nanopore sequencing data. ## Mapping diff --git a/docs/usage_targeted.md b/docs/usage_targeted.md index f6af7ff1..8f9d759d 100644 --- a/docs/usage_targeted.md +++ b/docs/usage_targeted.md @@ -53,10 +53,10 @@ An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv) ## Running the pipeline -The typical command for running the pipeline is as follows: +The typical command for running the pipeline for targeted CIRSPR analysis is as follows: ```bash -nextflow run nf-core/crisprseq --input samplesheet.csv --outdir -profile docker +nextflow run nf-core/crisprseq --input samplesheet.csv --analysis targeted --outdir -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index a81797b7..ba068d8b 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -389,9 +389,6 @@ workflow CRISPRSEQ_TARGETED { Channel.value("--sortbysize") ) - //VSEARCH_SORT.out.fasta.dump(tag:'vsearch_sort_output') - //ch_umi_bysize.cluster.dump(tag:'umi_clusters') - // Get the correspondent fasta sequencences from top cluster sequences // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta] @@ -464,14 +461,6 @@ workflow CRISPRSEQ_TARGETED { .filter{ it[1].countLines() > 0 } .set{ ch_minimap_1 } - //ch_clusters_sequence.dump(tag:'ch_umi_clusters') - //ch_top_clusters_sequence.dump(tag:'ch_vsearch_top') - ch_minimap_1.dump(tag:'minimap') - //ch_clusters_sequence - // .join(ch_top_clusters_sequence) - // .join(ch_minimap_1) - // .dump(tag:'clusters_top_alignment') - // // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1 // From 4ab6d0b6d7781abd51cc60fa6935c62774b07814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= Date: Tue, 27 Jun 2023 15:38:58 +0200 Subject: [PATCH 39/45] Apply suggestions from code review --- CHANGELOG.md | 2 +- workflows/crisprseq_targeted.nf | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bf11ddc..2e8564c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add UMI clustering to crisprseq-edition ([#24](https://github.com/nf-core/crisprseq/pull/24)) +- Add UMI clustering to crisprseq-targeted ([#24](https://github.com/nf-core/crisprseq/pull/24)) ### Fixed diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index ba068d8b..5265168e 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -678,9 +678,9 @@ workflow CRISPRSEQ_TARGETED { // // MODULE: Parse cigar to find edits // - //CIGAR_PARSER ( - // ch_to_parse_cigar - //) + CIGAR_PARSER ( + ch_to_parse_cigar + ) // From ae81ac0e873a9c146b2251a9fd48eeb58d5e7674 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 27 Jun 2023 15:53:29 +0200 Subject: [PATCH 40/45] add --umi_clustering parameter to opt-in the umi clustering steps --- nextflow.config | 1 + nextflow_schema.json | 20 +- workflows/crisprseq_targeted.nf | 387 ++++++++++++++++---------------- 3 files changed, 209 insertions(+), 199 deletions(-) diff --git a/nextflow.config b/nextflow.config index 77899b0e..952b570d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,7 @@ params { // Pipeline steps overrepresented = false + umi_clustering = false // UMI parameters umi_bin_size = 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index 1cc44ce2..31256db7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -58,6 +58,11 @@ "type": "boolean", "fa_icon": "fas fa-sort-numeric-up-alt", "description": "Trim overrepresented sequences from reads (cutadapt)" + }, + "umi_clustering": { + "type": "boolean", + "fa_icon": "fas fa-layer-group", + "description": "If the sample contains umi-molecular identifyers (UMIs), run the UMI extraction, clustering and consensus steps." } }, "fa_icon": "fas fa-shoe-prints" @@ -393,6 +398,13 @@ "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "schema_ignore_params": { + "type": "string", + "default": "genomes", + "description": "Ignore JSON schema validation of the following params", + "fa_icon": "fas fa-ban", + "hidden": true } } } @@ -428,11 +440,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "schema_ignore_params": { - "type": "string", - "default": "genomes" - } - } + ] } diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 5265168e..b7f0e025 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -324,224 +324,227 @@ workflow CRISPRSEQ_TARGETED { } - // - // MODULE: Extract UMI sequences - // - EXTRACT_UMIS ( - SEQTK_SEQ_MASK.out.fastx - ) + if (params.umi_clustering) { + // + // MODULE: Extract UMI sequences + // + EXTRACT_UMIS ( + SEQTK_SEQ_MASK.out.fastx + ) - // - // MODULE: Cluster UMIs - // - VSEARCH_CLUSTER ( - EXTRACT_UMIS.out.fasta - ) + // + // MODULE: Cluster UMIs + // + VSEARCH_CLUSTER ( + EXTRACT_UMIS.out.fasta + ) - // Obtain a file with UBS (UBI bin size) and UMI ID - VSEARCH_CLUSTER.out.clusters - .transpose() - .collectFile( storeDir:params.outdir ) { - it -> - [ "${it[0].id}_ubs.txt", "${it[1].countFasta()}\t${it[1].baseName}\n" ] - } + // Obtain a file with UBS (UBI bin size) and UMI ID + VSEARCH_CLUSTER.out.clusters + .transpose() + .collectFile( storeDir:params.outdir ) { + it -> + [ "${it[0].id}_ubs.txt", "${it[1].countFasta()}\t${it[1].baseName}\n" ] + } - // Branch the clusters into the ones containing only one sequence and the ones containing more than one sequences - VSEARCH_CLUSTER.out.clusters - .transpose() - .branch{ - meta, cluster -> - single: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() == 1 - return [meta, cluster] - cluster: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() > 1 - return [meta, cluster] - } - .set{ ch_umi_bysize } - - // Get the correspondent fasta sequencences from single clusters - ch_umi_bysize.single - .tap{ meta_channel } - .map{ meta, cluster -> - fasta_line = umi_to_sequence(cluster) - [meta, cluster.baseName, fasta_line] - } - .collectFile() { meta, name, fasta -> - [ "${name}_consensus.fasta", fasta ] - } - .map{ new_file -> - [new_file.baseName, new_file] - } - .join(meta_channel - .map { meta, original_file -> - ["${original_file.baseName}_consensus", meta] - }) - .map{ file_name, new_file, meta -> - [meta, new_file] - } - .set{ ch_single_clusters_consensus } + // Branch the clusters into the ones containing only one sequence and the ones containing more than one sequences + VSEARCH_CLUSTER.out.clusters + .transpose() + .branch{ + meta, cluster -> + single: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() == 1 + return [meta, cluster] + cluster: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() > 1 + return [meta, cluster] + } + .set{ ch_umi_bysize } + + // Get the correspondent fasta sequencences from single clusters + ch_umi_bysize.single + .tap{ meta_channel } + .map{ meta, cluster -> + fasta_line = umi_to_sequence(cluster) + [meta, cluster.baseName, fasta_line] + } + .collectFile() { meta, name, fasta -> + [ "${name}_consensus.fasta", fasta ] + } + .map{ new_file -> + [new_file.baseName, new_file] + } + .join(meta_channel + .map { meta, original_file -> + ["${original_file.baseName}_consensus", meta] + }) + .map{ file_name, new_file, meta -> + [meta, new_file] + } + .set{ ch_single_clusters_consensus } - // - // MODULE: Obtain most abundant UMI in cluster - // - VSEARCH_SORT( - ch_umi_bysize.cluster, - Channel.value("--sortbysize") - ) + // + // MODULE: Obtain most abundant UMI in cluster + // + VSEARCH_SORT( + ch_umi_bysize.cluster, + Channel.value("--sortbysize") + ) - // Get the correspondent fasta sequencences from top cluster sequences - // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps - VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta] - .tap{ meta_channel_2 } // [[id:sample_id, ...], sample_top.fasta] - .map{ meta, fasta -> - fasta_line = umi_to_sequence_centroid(fasta) - [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...] - } - .collectFile(cache:true,sort:true) { meta, name, fasta -> - [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta - } - .dump(tag:"collectfile output") - .map{ new_file -> - [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta] - } - .join(meta_channel_2 - .map { meta, original_file -> - ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]] - }) // [sample, sample_top.fasta, [id:sample_id, ...]] - .map{ file_name, new_file, meta -> - [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] - } - .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] + // Get the correspondent fasta sequencences from top cluster sequences + // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps + VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta] + .tap{ meta_channel_2 } // [[id:sample_id, ...], sample_top.fasta] + .map{ meta, fasta -> + fasta_line = umi_to_sequence_centroid(fasta) + [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...] + } + .collectFile(cache:true,sort:true) { meta, name, fasta -> + [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta + } + .dump(tag:"collectfile output") + .map{ new_file -> + [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta] + } + .join(meta_channel_2 + .map { meta, original_file -> + ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]] + }) // [sample, sample_top.fasta, [id:sample_id, ...]] + .map{ file_name, new_file, meta -> + [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] + } + .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] - // Get the correspondent fasta sequencences from UMI clusters - ch_umi_bysize.cluster // [[id:sample_id, ...], sample] - .tap{ meta_channel_3 } // [[id:sample_id, ...], sample] - .map{ meta, cluster -> - fasta_line = umi_to_sequence(cluster) - [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...] - } - .dump(tag:'map allreads output') - .collectFile(cache:true,sort:true) { meta, name, fasta -> - [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta - } - .dump(tag:"collectfile allreads output") - .map{ new_file -> - [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta] - } - .join(meta_channel_3 - .map { meta, original_file -> - ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]] - }) // [sample, sample_sequences.fasta, [id:sample_id, ...]] - .map{ file_name, new_file, meta -> - [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_sequences.fasta] - } - .set{ ch_clusters_sequence } + // Get the correspondent fasta sequencences from UMI clusters + ch_umi_bysize.cluster // [[id:sample_id, ...], sample] + .tap{ meta_channel_3 } // [[id:sample_id, ...], sample] + .map{ meta, cluster -> + fasta_line = umi_to_sequence(cluster) + [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...] + } + .dump(tag:'map allreads output') + .collectFile(cache:true,sort:true) { meta, name, fasta -> + [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta + } + .dump(tag:"collectfile allreads output") + .map{ new_file -> + [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta] + } + .join(meta_channel_3 + .map { meta, original_file -> + ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]] + }) // [sample, sample_sequences.fasta, [id:sample_id, ...]] + .map{ file_name, new_file, meta -> + [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_sequences.fasta] + } + .set{ ch_clusters_sequence } - // Cluster consensus & polishing - // Two cycles of minimap2 + racon + // Cluster consensus & polishing + // Two cycles of minimap2 + racon - // - // MODULE: Mapping with minimap2 - cycle 1 - // - // Map each cluster against the top read (most abundant UMI) in the cluster - MINIMAP2_ALIGN_UMI_1 ( - ch_clusters_sequence - .join(ch_top_clusters_sequence), - false, //output in paf format - false, - false - ) + // + // MODULE: Mapping with minimap2 - cycle 1 + // + // Map each cluster against the top read (most abundant UMI) in the cluster + MINIMAP2_ALIGN_UMI_1 ( + ch_clusters_sequence + .join(ch_top_clusters_sequence), + false, //output in paf format + false, + false + ) - MINIMAP2_ALIGN_UMI_1.out.paf.dump(tag:'minimap_unfiltered') + MINIMAP2_ALIGN_UMI_1.out.paf.dump(tag:'minimap_unfiltered') - // Only continue with clusters that have aligned sequences - MINIMAP2_ALIGN_UMI_1.out.paf - .filter{ it[1].countLines() > 0 } - .set{ ch_minimap_1 } + // Only continue with clusters that have aligned sequences + MINIMAP2_ALIGN_UMI_1.out.paf + .filter{ it[1].countLines() > 0 } + .set{ ch_minimap_1 } - // - // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1 - // - RACON_1 ( - ch_clusters_sequence - .join(ch_top_clusters_sequence) - .join(ch_minimap_1) - ) + // + // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1 + // + RACON_1 ( + ch_clusters_sequence + .join(ch_top_clusters_sequence) + .join(ch_minimap_1) + ) - // - // MODULE: Mapping with minimap2 - cycle 2 - // - // Map each cluster against the top read (most abundant UMI) in the cluster - MINIMAP2_ALIGN_UMI_2 ( - ch_clusters_sequence - .join(RACON_1.out.improved_assembly), - false, //output in paf format - false, - false - ) + // + // MODULE: Mapping with minimap2 - cycle 2 + // + // Map each cluster against the top read (most abundant UMI) in the cluster + MINIMAP2_ALIGN_UMI_2 ( + ch_clusters_sequence + .join(RACON_1.out.improved_assembly), + false, //output in paf format + false, + false + ) - // Only continue with clusters that have aligned sequences - MINIMAP2_ALIGN_UMI_2.out.paf - .filter{ it[1].countLines() > 0 } - .set{ ch_minimap_2 } + // Only continue with clusters that have aligned sequences + MINIMAP2_ALIGN_UMI_2.out.paf + .filter{ it[1].countLines() > 0 } + .set{ ch_minimap_2 } - // - // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2 - // - RACON_2 ( - ch_clusters_sequence - .join(RACON_1.out.improved_assembly) - .join(ch_minimap_2) - ) + // + // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2 + // + RACON_2 ( + ch_clusters_sequence + .join(RACON_1.out.improved_assembly) + .join(ch_minimap_2) + ) - // - // MODULE: Obtain a consensus sequence - // - MEDAKA ( - ch_clusters_sequence - .join(RACON_2.out.improved_assembly) - ) + // + // MODULE: Obtain a consensus sequence + // + MEDAKA ( + ch_clusters_sequence + .join(RACON_2.out.improved_assembly) + ) - // Collect all consensus UMI sequences into one single file per sample - MEDAKA.out.assembly - .tap{ meta_channel_4 } - .map{ meta, file -> - file_content = file.getText() - [meta, file_content] // [[id:sample_id, ...], consensus_content] - } - .collectFile() { meta, file -> - [ "${meta.id}_consensus.fasta", file ] - } - .map{ new_file -> - [new_file.baseName, new_file] - } - .join(meta_channel_4 - .map{ meta, consensus -> - ["${meta.id}_consensus", meta] + // Collect all consensus UMI sequences into one single file per sample + MEDAKA.out.assembly + .tap{ meta_channel_4 } + .map{ meta, file -> + file_content = file.getText() + [meta, file_content] // [[id:sample_id, ...], consensus_content] } - ) - .map{ name, file, meta -> - [meta - meta.subMap('cluster_id'), file] - } - .set{ ch_umi_consensus } + .collectFile() { meta, file -> + [ "${meta.id}_consensus.fasta", file ] + } + .map{ new_file -> + [new_file.baseName, new_file] + } + .join(meta_channel_4 + .map{ meta, consensus -> + ["${meta.id}_consensus", meta] + } + ) + .map{ name, file, meta -> + [meta - meta.subMap('cluster_id'), file] + } + .set{ ch_umi_consensus } - // - // MODULE: Convert fasta to fastq - // - SEQTK_SEQ_FATOFQ ( - ch_umi_consensus - ) + // + // MODULE: Convert fasta to fastq + // + SEQTK_SEQ_FATOFQ ( + ch_umi_consensus + ) + } + ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ.out.fastx // // MODULE: Summary of clustered reads // CLUSTERING_SUMMARY ( - SEQTK_SEQ_FATOFQ.out.fastx + ch_preprocess_reads .join(MERGING_SUMMARY.out.summary) ) @@ -551,7 +554,7 @@ workflow CRISPRSEQ_TARGETED { // if (params.aligner == "minimap2") { MINIMAP2_ALIGN_ORIGINAL ( - SEQTK_SEQ_FATOFQ.out.fastx + ch_preprocess_reads .join(ORIENT_REFERENCE.out.reference), true, false, @@ -570,7 +573,7 @@ workflow CRISPRSEQ_TARGETED { ) ch_versions = ch_versions.mix(BWA_INDEX.out.versions) BWA_MEM ( - SEQTK_SEQ_FATOFQ.out.fastx, + ch_preprocess_reads, BWA_INDEX.out.index, true ) @@ -587,7 +590,7 @@ workflow CRISPRSEQ_TARGETED { ) ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions) BOWTIE2_ALIGN ( - SEQTK_SEQ_FATOFQ.out.fastx, + ch_preprocess_reads, BOWTIE2_BUILD.out.index, false, true From 602a52b50c7ea49f471e486598b96619fb993249 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 27 Jun 2023 15:59:35 +0200 Subject: [PATCH 41/45] add pipeline optional step parameters to usage documentation --- docs/usage/targeted.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/usage/targeted.md b/docs/usage/targeted.md index 667d03df..9f58cc45 100644 --- a/docs/usage/targeted.md +++ b/docs/usage/targeted.md @@ -55,6 +55,26 @@ chr6,chr6-61942198-61942498_R1.fastq.gz,,CAA...GGA,TTTTATGATATTTATCTTTT,TTC...CA An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv) has been provided with the pipeline. +## Optional pipeline steps + +### Trimming of overrepresented sequences + +To trim the overrepresented sequences found with FastQC from the reads, use the parameter `--overrepresented`. +Such sequences are not trimmed by default. +When using the `--overrepresented` parameter, cutadapt is used to trim overrepresented sequences from the input FASTQ files. + +### UMI clustering + +If the provided samples were sequenced using umi-molecular identifyers (UMIs), use the parameter `--umi_clustering` in order to run the clustering steps. + +1. Extract UMI sequences (Python script) +2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch)) +3. Obtain the most abundant UMI sequence for each cluster ([`Vsearch`](https://github.com/torognes/vsearch)) +4. Obtain a consensus for each cluster ([`minimap2`](https://github.com/lh3/minimap2)) +5. Polish consensus sequence ([`racon`](https://github.com/lbcb-sci/racon)) +6. Repeat a second rand of consensus + polishing (`minimap2` + `racon`) +7. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html)) + ## Running the pipeline The typical command for running the pipeline is as follows: From 09201ed49df88c4c025fa799eceb73a8e68e80a4 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 27 Jun 2023 15:59:51 +0200 Subject: [PATCH 42/45] add pipeline optional step parameters to usage documentation --- docs/usage/targeted.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage/targeted.md b/docs/usage/targeted.md index 9f58cc45..f319aa56 100644 --- a/docs/usage/targeted.md +++ b/docs/usage/targeted.md @@ -61,11 +61,11 @@ An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv) To trim the overrepresented sequences found with FastQC from the reads, use the parameter `--overrepresented`. Such sequences are not trimmed by default. -When using the `--overrepresented` parameter, cutadapt is used to trim overrepresented sequences from the input FASTQ files. +When using the `--overrepresented` parameter, Cutadapt is used to trim overrepresented sequences from the input FASTQ files. ### UMI clustering -If the provided samples were sequenced using umi-molecular identifyers (UMIs), use the parameter `--umi_clustering` in order to run the clustering steps. +If the provided samples were sequenced using umi-molecular identifiers (UMIs), use the parameter `--umi_clustering` in order to run the clustering steps. 1. Extract UMI sequences (Python script) 2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch)) From bf99c289b7f769341f2523ae672177bacb3da20f Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 27 Jun 2023 16:04:24 +0200 Subject: [PATCH 43/45] fix alias SEQTK_SEQ_MASK --- conf/modules.config | 2 +- workflows/crisprseq_targeted.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 50a27953..bdfbdd64 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -126,7 +126,7 @@ process { ] } - withName: SEQTK_SEQ { + withName: SEQTK_SEQ_MASK { ext.args = '-q 20 -L 80 -n N' publishDir = [ path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index b7f0e025..8aace856 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -538,7 +538,7 @@ workflow CRISPRSEQ_TARGETED { ) } - ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ.out.fastx + ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ_MASK.out.fastx // // MODULE: Summary of clustered reads From fb1c14fcb8aacebfd4ced525241954f0ea5c3b13 Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Tue, 27 Jun 2023 17:21:47 +0200 Subject: [PATCH 44/45] fix: MERGING_SUMMARY runs when --overrepresented is false --- templates/merging_summary.py | 26 ++++++++++++++++---------- workflows/crisprseq_targeted.nf | 32 ++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/templates/merging_summary.py b/templates/merging_summary.py index 0eb1239a..804a5be3 100755 --- a/templates/merging_summary.py +++ b/templates/merging_summary.py @@ -6,22 +6,28 @@ with gzip.open("${raw_reads[0]}", "rt") as handle: raw_reads_count = len(list(SeqIO.parse(handle, "fastq"))) -if "$assembled_reads" == "null": + +if "$assembled_reads" == "null_a": assembled_reads_count = 0 else: with gzip.open("$assembled_reads", "rt") as handle: assembled_reads_count = len(list(SeqIO.parse(handle, "fastq"))) # Merged reads R1+R2 + with gzip.open("$trimmed_reads", "rt") as handle: trimmed_reads_count = len(list(SeqIO.parse(handle, "fastq"))) # Filtered reads -with open("$trimmed_adapters", "r") as handle: - for line in handle: - if line.startswith("Reads with adapters"): - for field in "".join(line.split(",")).split(): - if field.isdigit(): - adapters_count = field # reads with adapters - if "%" in field: - adapters_percentage = field # percentage of reads with adapters: ex. "(100.0%)" +if "$trimmed_adapters" == "null_t": + adapters_count = 0 + adapters_percentage = "(0.0%)" +else: + with open("$trimmed_adapters", "r") as handle: + for line in handle: + if line.startswith("Reads with adapters"): + for field in "".join(line.split(",")).split(): + if field.isdigit(): + adapters_count = field # reads with adapters + if "%" in field: + adapters_percentage = field # percentage of reads with adapters: ex. "(100.0%)" if "$task.ext.prefix" != "null": prefix = "$task.ext.prefix" @@ -35,7 +41,7 @@ f"merged-reads, {assembled_reads_count} ({round(assembled_reads_count * 100 / raw_reads_count,1)}%)\\n" ) output_file.write(f"reads-with-adapters, {adapters_count} {adapters_percentage}\\n") - if "$assembled_reads" == "null": + if "$assembled_reads" == "null_a": output_file.write( f"quality-filtered-reads, {trimmed_reads_count} ({round(trimmed_reads_count * 100 / raw_reads_count,1)}%)\\n" ) diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index 8aace856..9484ee28 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -303,24 +303,36 @@ workflow CRISPRSEQ_TARGETED { ) ch_versions = ch_versions.mix(SEQTK_SEQ_MASK.out.versions) - - // - // MODULE: Summary of merged reads - // - MERGING_SUMMARY { + if (params.overrepresented) { + ch_cat_fastq.paired + .mix(ch_cat_fastq.single) + .join(PEAR.out.assembled, remainder: true) + .join(SEQTK_SEQ_MASK.out.fastx) + .join(CUTADAPT.out.log) + .set { ch_merging_summary_data } + } else { ch_cat_fastq.paired .mix(ch_cat_fastq.single) .join(PEAR.out.assembled, remainder: true) .join(SEQTK_SEQ_MASK.out.fastx) - .join(( params.overrepresented ? CUTADAPT.out.log : Channel.value("null") )) + .combine(Channel.value("null")) .map { meta, reads, assembled, masked, trimmed -> if (assembled == null) { - dummy = file('null') - return [ meta, reads, dummy, masked, trimmed ] - } else { - return [ meta, reads, assembled, masked, trimmed ] + assembled = file('null_a') + } + if (trimmed == "null") { + trimmed = file('null_t') } + return [ meta, reads, assembled, masked, trimmed ] } + .set { ch_merging_summary_data } + } + + // + // MODULE: Summary of merged reads + // + MERGING_SUMMARY { + ch_merging_summary_data } From 7f9bfda971dfe8eeaf16ee273d8b49dcdae16c7f Mon Sep 17 00:00:00 2001 From: mirpedrol Date: Wed, 28 Jun 2023 08:51:53 +0200 Subject: [PATCH 45/45] add param umi_clustering to test_umis --- conf/test_umis.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/conf/test_umis.config b/conf/test_umis.config index cfa11b55..6efc80d8 100644 --- a/conf/test_umis.config +++ b/conf/test_umis.config @@ -20,8 +20,9 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv' - analysis = 'targeted' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv' + analysis = 'targeted' + umi_clustering = true // Aligner aligner = 'minimap2'