Add optional merging and trimming #142

Merged (Mar 4, 2019)
Changes from 15 commits (56 commits total)
eb59405
Extra clarifications for indices, FastP and general cleanup
jfy133 Feb 1, 2019
df6305f
Merge pull request #136 from nf-core/help_message_improvements
apeltzer Feb 1, 2019
f43f4b0
Further clarification on `--max-cpus`
jfy133 Feb 2, 2019
45ce547
Update docs/usage.md
apeltzer Feb 2, 2019
e039f28
Merge pull request #139 from nf-core/docs_update
apeltzer Feb 2, 2019
91d05cb
add noCollase option
maxibor Feb 11, 2019
758fb7f
fix nf-PE read index
maxibor Feb 11, 2019
e2aaba0
update doc with noCollapse
maxibor Feb 11, 2019
bdd56dc
merge master to dev
maxibor Feb 11, 2019
55743dd
main.nf to dev
maxibor Feb 11, 2019
8ec9d67
fix funky prefix
maxibor Feb 11, 2019
6d864e1
Update docs/usage.md
apeltzer Feb 11, 2019
f3a6ff7
Update docs/usage.md
apeltzer Feb 12, 2019
0e03a00
skip trimming and collapsing
maxibor Feb 12, 2019
757e930
update test
maxibor Feb 12, 2019
949fd28
Address issue with picard memory
apeltzer Feb 19, 2019
435812b
Merge pull request #145 from apeltzer/fix-picard
apeltzer Feb 19, 2019
b311d61
Use CSI indices wherever possible
apeltzer Feb 21, 2019
a1a69a7
Merge remote-tracking branch 'upstream/dev' into fix-samtools-idx
apeltzer Feb 21, 2019
9eb36c7
Add proper changelog
apeltzer Feb 21, 2019
8d26b33
Added a contributor section to README
apeltzer Feb 21, 2019
8e29eff
Update README.md with instructions for test data
evanfloden Feb 21, 2019
8f3c151
Merge pull request #148 from evanfloden/patch-1
apeltzer Feb 21, 2019
abf4642
Add docs on this
apeltzer Feb 21, 2019
8a6dccb
Update new parameter `large_ref`
apeltzer Feb 25, 2019
b6f65b1
Fixing indices hopefully
apeltzer Feb 25, 2019
f9ac1d4
Fixing indexing
apeltzer Feb 25, 2019
7405c8e
Fix for post-dup steps
apeltzer Feb 25, 2019
7e4035d
Nicer changelog [skip ci]
apeltzer Feb 25, 2019
76e0cbd
Its unpublished stuff [skip ci]
apeltzer Feb 25, 2019
a2364b4
Merge pull request #3 from nf-core/dev
jfy133 Feb 25, 2019
e971cac
Made polyG param clearer what it is
jfy133 Feb 25, 2019
0feeb19
Update CHANGELOG.md
jfy133 Feb 25, 2019
29306df
Updated ploy G trim flag
jfy133 Feb 25, 2019
028f7ea
Merge pull request #5 from jfy133/polyg-name-improvements
jfy133 Feb 25, 2019
bc02be2
Update CHANGELOG.md
jfy133 Feb 25, 2019
ffa0085
Merge pull request #4 from jfy133/polyg-name-improvement
jfy133 Feb 25, 2019
7836ab8
Update CHANGELOG.md
jfy133 Feb 25, 2019
6d13545
Added fastP position to MultiQC config
jfy133 Feb 25, 2019
ee3c44f
Merge pull request #147 from apeltzer/fix-samtools-idx
apeltzer Feb 26, 2019
0fb1481
Merge branch 'dev' into fix-post-dedup-steps
apeltzer Feb 26, 2019
fabd1a3
Should fix the remaining issues
apeltzer Feb 26, 2019
24ea4b7
Merge branch 'fix-post-dedup-steps' of https://github.com/apeltzer/ea…
apeltzer Feb 26, 2019
c69261d
Address issues with qualimap / multiqc / multiple samples and reporting
apeltzer Feb 26, 2019
e85b58f
Merge pull request #151 from apeltzer/fix-post-dedup-steps
apeltzer Feb 27, 2019
16fedae
Merge branch 'dev' into dev
apeltzer Feb 27, 2019
71f0329
Merge pull request #152 from jfy133/dev
apeltzer Feb 27, 2019
817b9e0
Adding in publishing dedup log files as well
apeltzer Mar 1, 2019
5a255fd
Merge pull request #157 from apeltzer/publish_dedup
apeltzer Mar 2, 2019
c0251cb
Merge branch 'dev' into dev
apeltzer Mar 4, 2019
3320464
move size out of the if clause
apeltzer Mar 4, 2019
def1b35
match skip_* pattern
maxibor Mar 4, 2019
2bef839
update travis test
maxibor Mar 4, 2019
e62b503
local executor for skipping AR process
maxibor Mar 4, 2019
e807c58
initialize skip_adapterremoval
maxibor Mar 4, 2019
01d056c
fix bam test
maxibor Mar 4, 2019
4 changes: 4 additions & 0 deletions .travis.yml
@@ -40,6 +40,10 @@ script:
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --saveReference
# Run the basic pipeline with single end data (pretending it's single end actually)
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --singleEnd --bwa_index results/reference_genome/bwa_index/bwa_index/
# Run the basic pipeline with paired end data without collapsing
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_collapse --saveReference
# Run the basic pipeline with paired end data without collapsing nor trimming
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_collapse --skip_trim --saveReference
# Run the same pipeline testing optional step: fastp, complexity
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/
# Test BAM Trimming
22 changes: 20 additions & 2 deletions docs/usage.md
@@ -237,7 +237,7 @@ Use to set a top-limit for the default time requirement for each process.
Should be a string in the format integer-unit. eg. `--max_time '2.h'`. If not specified, will be taken from the configuration in the `-profile` flag.

### `--max_cpus`
Use to set a top-limit for the default CPU requirement for each process.
Use to set a top-limit for the default CPU requirement for each **process**. This is not the maximum number of CPUs the whole pipeline can use, but the maximum each program can use for a single program submission (known in Nextflow as a process). Do not set this higher than what your workstation or compute node can provide. If you're unsure, ask your local IT administrator for details on compute node capabilities!
Should be a string in the format integer-unit. eg. `--max_cpus 1`. If not specified, will be taken from the configuration in the `-profile` flag.
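Because `--max_cpus` caps each process rather than the whole run, a sensible value is bounded by a single node's core count. A minimal shell sketch of picking a safe value (the per-process budget of 4 is an arbitrary example, not a pipeline default):

```bash
#!/usr/bin/env bash
# Cap --max_cpus at whichever is smaller: this node's core count or an
# illustrative per-process budget of 4.
CORES=$(nproc)   # cores available on the current node
BUDGET=4
MAX_CPUS=$(( CORES < BUDGET ? CORES : BUDGET ))
echo "--max_cpus ${MAX_CPUS}"
```

The printed flag can then be appended to the `nextflow run` command line.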

### `--email`
@@ -279,12 +279,30 @@ This part of the documentation contains a list of user-adjustable parameters in

## Step skipping parameters

Some of the steps in the pipeline can be executed optionally. If you skip a step, there won't be any output related to that module.

### `--skip_preseq`

Turns off library complexity estimation with preseq.

### `--skip_collapse`
If you have paired-end data, but you don't want to merge (collapse) the overlapping reads, add the command line argument `--skip_collapse`.

For example:
```bash
--pairedEnd --skip_collapse --reads '*.fastq'
```

### `--skip_trim`

Turns off adapter and quality trimming.

For example:
```bash
--pairedEnd --skip_trim --reads '*.fastq'
```


### `--skip_damage_calculation`

Turns off the DamageProfiler module, which computes DNA damage profiles.
152 changes: 110 additions & 42 deletions main.nf
@@ -26,32 +26,34 @@ def helpMessage() {

Mandatory arguments:
--reads Path to input data (must be surrounded with quotes)
-profile Hardware config to use (e.g. standard, docker, singularity, conda, aws). Ask your system admin if unsure, or check documentatoin.
-profile Institution or personal hardware config to use (e.g. standard, docker, singularity, conda, aws). Ask your system admin if unsure, or check documentation.
--singleEnd Specifies that the input is single end reads (required if not pairedEnd)
--pairedEnd Specifies that the input is paired end reads (required if not singleend)
--pairedEnd Specifies that the input is paired end reads (required if not singleEnd)
--bam Specifies that the input is in BAM format
--fasta Path to Fasta reference (required if not iGenome reference)
--genome Name of iGenomes reference (required if not fasta reference)

Input Data Additional Options:
--snpcapture Runs in SNPCapture mode (specify a BED file if you do this!)

References If not specified in the configuration file or you wish to overwrite any of the references.
--bwa_index Path to BWA index
References If not specified in the configuration file, or you wish to overwrite any of the references.
--bwa_index Path to directory containing BWA index files
--bedfile Path to BED file for SNPCapture methods
--seq_dict Path to sequence dictionary file
--fasta_index Path to FastA index
--seq_dict Path to picard sequence dictionary file (typically ending in '.dict')
--fasta_index Path to samtools FASTA index (typically ending in '.fai')
--saveReference Saves reference genome indices for later reusage

Skipping Skip any of the mentioned steps
--skip_collapse Skip merging Forward and Reverse reads together. (Only for pairedEnd samples)
I think it would be clearer/more consistent to have a --skip_adapterremoval flag here (which switches both --skip_collapse and --skip_trim to false).

Then we move the --skip_collapse and --skip_trim in help message to 'AdapterRemoval' section, as in that section you're actually modifying the module, not turning entire modules on and off.

--skip_trim Skip adapter and quality trimming
--skip_preseq Skip preseq library complexity estimation
--skip_damage_calculation Skip DamageProfiler damage calculation
--skip_qualimap Skip Qualimap BAM quality statistics
--skip_deduplication Skip duplicate removal

Complexity Filtering
--complexity_filtering Run complexity filtering on FastQ files
--complexity_filter_poly_g_min Specify poly-g min filter (default: 10) for filtering
--complexity_filtering Run poly-G removal on FASTQ files
--complexity_filter_poly_g_min Specify length of poly-g min for clipping to be performed (default: 10)

Clipping / Merging
--clip_forward_adaptor Specify adapter sequence to be clipped off (forward)
@@ -61,23 +63,23 @@ def helpMessage() {
--min_adap_overlap Specify minimum adapter overlap

BWA Mapping
--bwaalnn Specify the -n parameter for BWA aln
--bwaalnn Specify the -n parameter for BWA aln.
--bwaalnk Specify the -k parameter for BWA aln
--bwaalnl Specify the -l parameter for BWA aln

CircularMapper
--circularmapper Turn on CircularMapper (CM)
--circularextension Specify the number of bases to extend
--circularextension Specify the number of bases to extend reference by
--circulartarget Specify the target chromosome for CM
--circularfilter Specify to filter off-target reads

BWA Mem Mapping
--bwamem Turn on BWA Mem instead of CM/BWA aln for mapping
--bwamem Turn on BWA Mem instead of BWA aln for mapping

BAM Filtering
--bam_discard_unmapped Discards unmapped reads in either FASTQ or BAM format, depending on choice.
--bam_unmapped_type Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format (options: discard, bam, fastq, both).
--bam_mapping_quality_threshold Minimum mapping quality for reads filter, default 0.
--bam_discard_unmapped Discards unmapped reads in either FASTQ or BAM format, depending on choice (see --bam_unmapped_type).
--bam_unmapped_type Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format (options: discard, bam, fastq, both).

DeDuplication
--dedupper Deduplication method to use (options: dedup, markduplicates). Default: dedup
@@ -101,10 +103,6 @@ def helpMessage() {
--bamutils_clip_left / --bamutils_clip_right Specify the number of bases to clip off reads
--bamutils_softclip Use softclip instead of hard masking


For a full list and more information of available parameters, consider the documentation.


Other options:
--outdir The output directory where the results will be saved
--email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
@@ -114,6 +112,9 @@
--max_time Time limit for each step of the pipeline. Should be in form e.g. --max_time '2.h'
--max_cpus Maximum number of CPUs to use for each step of the pipeline. Should be in form e.g. --max_cpus 1

For a full list and more information on available parameters, consult the documentation (https://github.com/nf-core/eager/).


""".stripIndent()
}
/*
@@ -146,6 +147,8 @@ params.email = false
params.plaintext_email = false

// Skipping parts of the pipeline for impatient users
params.skip_collapse = false
params.skip_trim = false
params.skip_preseq = false
params.skip_damage_calculation = false
params.skip_qualimap = false
@@ -265,6 +268,10 @@ if( params.singleEnd || params.pairedEnd || params.bam){
exit 1, "Please specify either --singleEnd, --pairedEnd to execute the pipeline on FastQ files and --bam for previously processed BAM files!"
}

//Validate that skip_collapse is only set to True for pairedEnd reads!
if (params.skip_collapse && params.singleEnd){
exit 1, "--skip_collapse can only be set for pairedEnd samples!"
}
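The sanity check above can be sketched outside Nextflow as a plain shell function (the variable names here are illustrative, not pipeline parameters):

```bash
#!/usr/bin/env bash
# Reject the unsupported combination of single-end input with --skip_collapse,
# mirroring the Nextflow validation above.
validate_flags() {
  local single_end=$1 skip_collapse=$2
  if [ "$skip_collapse" = "true" ] && [ "$single_end" = "true" ]; then
    echo "--skip_collapse can only be set for pairedEnd samples!" >&2
    return 1
  fi
}

validate_flags false true && echo "pairedEnd + skip_collapse: ok"
validate_flags true true || echo "singleEnd + skip_collapse: rejected"
```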

//AWSBatch sanity checking
if(workflow.profile == 'awsbatch'){
@@ -349,6 +356,8 @@ summary['Reads'] = params.reads
summary['Fasta Ref'] = params.fasta
if(params.bwa_index) summary['BWA Index'] = params.bwa_index
summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End'
summary['Skip Collapsing'] = params.skip_collapse ? 'Yes' : 'No'
summary['Skip Trimming'] = params.skip_trim ? 'Yes' : 'No'
summary['Max Memory'] = params.max_memory
summary['Max CPUs'] = params.max_cpus
summary['Max Time'] = params.max_time
@@ -368,7 +377,7 @@ if(workflow.profile == 'awsbatch'){
summary['AWS Queue'] = params.awsqueue
}
if(params.email) summary['E-mail Address'] = params.email
log.info summary.collect { k,v -> "${k.padRight(15)}: $v" }.join("\n")
log.info summary.collect { k,v -> "${k.padRight(35)}: $v" }.join("\n")
log.info "========================================="
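The `padRight(35)` change widens the key column of the run summary so longer keys like "Skip Collapsing" still align. The effect is the same as a fixed-width printf; the key/value pairs below are sample data:

```bash
# Pad keys to 35 characters so values line up, like Groovy's padRight(35).
printf '%-35s: %s\n' "Skip Collapsing" "No"
printf '%-35s: %s\n' "Skip Trimming" "No"
```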


@@ -508,7 +517,7 @@ process convertBam {

output:
set val("${base}"), file("*.fastq.gz") into (ch_read_files_converted_fastqc, ch_read_files_converted_fastp)
file("*.fastq.gz") into (ch_read_files_converted_mapping_bwa, ch_read_files_converted_mapping_cm, ch_read_files_converted_mapping_bwamem)
set val("${base}"), file("*.fastq.gz") into (ch_read_files_converted_mapping_bwa, ch_read_files_converted_mapping_cm, ch_read_files_converted_mapping_bwamem)

script:
base = "${bam.baseName}"
@@ -580,33 +589,58 @@ process adapter_removal {
tag "$name"
publishDir "${params.outdir}/read_merging", mode: 'copy'

echo true

when: !params.bam

input:
set val(name), file(reads) from ( params.complexity_filter ? ch_clipped_reads_complexity_filtered : ch_read_files_clip )

output:
file "*.combined*.gz" into (ch_clipped_reads, ch_clipped_reads_for_fastqc,ch_clipped_reads_circularmapper,ch_clipped_reads_bwamem)
file "*.settings" into ch_adapterremoval_logs
file("*.settings") optional true into ch_adapterremoval_logs

script:
prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/
//Readprefixing only required for PE data with merging
fixprefix = (params.singleEnd) ? "" : "AdapterRemovalFixPrefix ${prefix}.combined.fq.gz ${prefix}.combined.prefixed.fq.gz"
base = reads[0].baseName

if( !params.singleEnd ){
if( !params.singleEnd && !params.skip_collapse && !params.skip_trim){
"""
AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${prefix} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} --collapse
AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} --collapse
#Combine files
zcat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz | gzip > ${prefix}.combined.fq.gz
${fixprefix}
rm ${prefix}.combined.fq.gz
zcat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz | gzip > ${base}.combined.fq.gz
"""
} else {
} else if (!params.singleEnd && params.skip_collapse && !params.skip_trim) {
"""
AdapterRemoval --file1 ${reads[0]} --basename ${prefix} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality}
AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
#Rename files
mv ${base}.pair1.truncated.gz ${base}.pair1.combined.fq.gz
mv ${base}.pair2.truncated.gz ${base}.pair2.combined.fq.gz
"""
} else if (!params.singleEnd && !params.skip_collapse && params.skip_trim) {
bogus_adaptor = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
"""
AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} --gzip --threads ${task.cpus} --collapse --adapter1 $bogus_adaptor --adapter2 $bogus_adaptor
#Rename files
mv ${base}.pair1.truncated.gz ${base}.pair1.combined.fq.gz
mv ${base}.pair2.truncated.gz ${base}.pair2.combined.fq.gz
"""
} else if (params.singleEnd && params.skip_trim){
"""
mv ${reads[0]} ${base}.combined.fq.gz
echo "Skipped trimming and merging by AdapterRemoval"
"""
} else if (params.pairedEnd && params.skip_collapse && params.skip_trim){
"""
mv ${reads[0]} ${base}.pair1.combined.fq.gz
mv ${reads[1]} ${base}.pair2.combined.fq.gz
echo "Skipped trimming and merging by AdapterRemoval"
"""
}
else {
"""
AdapterRemoval --file1 ${reads[0]} --basename ${base} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality}
# Pseudo-Combine
mv *.truncated.gz ${prefix}.combined.fq.gz
mv *.truncated.gz ${base}.combined.fq.gz
"""
}
}
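The collapsing branch above concatenates AdapterRemoval's several output files (`.collapsed.gz`, `.truncated.gz`, …) into one combined FASTQ. A toy reproduction of that combine step with dummy gzipped files (the read records are placeholders, not real data):

```bash
#!/usr/bin/env bash
set -e
# Recreate the 'combine' step on dummy data: several gzipped read files
# are concatenated into a single <base>.combined.fq.gz.
workdir=$(mktemp -d)
cd "$workdir"
printf '@r1\nACGT\n+\nIIII\n' | gzip > sample.collapsed.gz
printf '@r2\nTTAA\n+\nIIII\n' | gzip > sample.pair1.truncated.gz
printf '@r3\nGGCC\n+\nIIII\n' | gzip > sample.pair2.truncated.gz
zcat sample.collapsed.gz sample.pair1.truncated.gz sample.pair2.truncated.gz \
  | gzip > sample.combined.fq.gz
zcat sample.combined.fq.gz | grep -c '^@r'   # 3 reads survive the merge
```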
@@ -615,7 +649,7 @@ process adapter_removal {
* STEP 2.1 - FastQC after clipping/merging (if applied!)
*/
process fastqc_after_clipping {
tag "${reads[0].baseName}"
tag "${prefix}"
publishDir "${params.outdir}/FastQC/after_clipping", mode: 'copy',
saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"}

@@ -628,6 +662,7 @@ process fastqc_after_clipping {
file "*_fastqc.{zip,html}" optional true into ch_fastqc_after_clipping

script:
prefix = reads[0].toString().tokenize('.')[0]
"""
fastqc -q $reads
"""
@@ -654,13 +689,24 @@ process bwa {


script:
prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/
fasta = "${index}/*.fasta"
if (!params.singleEnd && params.skip_collapse ){
prefix = reads[0].toString().tokenize('.')[0]
"""
bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai"
bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
samtools index "${prefix}".sorted.bam
bwa aln -t ${task.cpus} $fasta ${reads[0]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r1.sai
bwa aln -t ${task.cpus} $fasta ${reads[1]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r2.sai
bwa sampe -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.r1.sai ${prefix}.r2.sai ${reads[0]} ${reads[1]} | samtools sort -@ ${task.cpus} -O bam - > ${prefix}.sorted.bam
samtools index ${prefix}.sorted.bam
"""
} else {
prefix = reads[0].toString().tokenize('.')[0]
"""
bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.sai
bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
samtools index ${prefix}.sorted.bam
"""
}

}
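Both mapping branches now derive `prefix` with `reads[0].toString().tokenize('.')[0]`, i.e. everything before the first dot of the first read file's name. The bash analogue, using a made-up example filename:

```bash
# Groovy: reads[0].toString().tokenize('.')[0]
f="Sample1.pair1.combined.fq.gz"
prefix=${f%%.*}   # strip everything from the first '.' onward
echo "$prefix"    # Sample1
```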

process circulargenerator{
@@ -708,16 +754,30 @@ process circularmapper{

script:
filter = "${params.circularfilter}" ? '' : '-f true -x false'
prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/

fasta = "${index}/*_*.fasta"

if (!params.singleEnd && params.skip_collapse ){
prefix = reads[0].toString().tokenize('.')[0]
"""
bwa aln -t ${task.cpus} $fasta ${reads[0]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r1.sai
bwa aln -t ${task.cpus} $fasta ${reads[1]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r2.sai
bwa sampe -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.r1.sai ${prefix}.r2.sai ${reads[0]} ${reads[1]} > tmp.out
realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter
samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > ${prefix}.sorted.bam
samtools index ${prefix}.sorted.bam
"""
} else {
prefix = reads[0].toString().tokenize('.')[0]
"""
bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai"
bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads > tmp.out
bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.sai
bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.sai $reads > tmp.out
realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter
samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${prefix}".sorted.bam
samtools index "${prefix}".sorted.bam
samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > ${prefix}.sorted.bam
samtools index ${prefix}.sorted.bam
"""
}

}

process bwamem {
@@ -738,10 +798,18 @@ process bwamem {
script:
prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/
fasta = "${index}/*.fasta"
if (!params.singleEnd && params.skip_collapse ){
"""
bwa mem -t ${task.cpus} $fasta ${reads[0]} ${reads[1]} -R "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
samtools index -@ ${task.cpus} "${prefix}".sorted.bam
"""
} else {
"""
bwa mem -t ${task.cpus} $fasta $reads -R "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam
samtools index -@ ${task.cpus} "${prefix}".sorted.bam
"""
}

}

/*