diff --git a/.travis.yml b/.travis.yml index 459e925cb..dab87ae31 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,6 +40,12 @@ script: - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --saveReference # Run the basic pipeline with single end data (pretending its single end actually) - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --singleEnd --bwa_index results/reference_genome/bwa_index/bwa_index/ + # Run the basic pipeline with paired end data without collapsing + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_collapse --saveReference + # Run the basic pipeline with paired end data without trimming + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_trim --saveReference + # Run the basic pipeline with paired end data without adapterRemoval + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_adapterremoval --saveReference # Run the same pipeline testing optional step: fastp, complexity - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/ # Test BAM Trimming diff --git a/CHANGELOG.md b/CHANGELOG.md index 661f73ea6..e281225a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#152](https://github.com/nf-core/eager/pull/152) - Clarified `--complexity_filter` flag to be specifically for poly G trimming. * [#155](https://github.com/nf-core/eager/pull/155) - Added [Dedup log to output folders](https://github.com/nf-core/eager/issues/154) +* [#159](https://github.com/nf-core/eager/pull/159) - Added Possibility to skip AdapterRemoval, skip merging, skip trimming fixing [#64](https://github.com/nf-core/eager/issues/64),[#137](https://github.com/nf-core/eager/issues/137) - thanks to @maxibor, @jfy133 ### `Fixed` diff --git a/docs/usage.md b/docs/usage.md index ed12d1d4a..5a848760b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -283,12 +283,16 @@ This part of the documentation contains a list of user-adjustable parameters in ## Step skipping parameters -Some of the steps in the pipeline can be executed optionally. If you specify specific steps to be skipped, there won't be any output related to these modules. +Some of the steps in the pipeline can be executed optionally. If you specify specific steps to be skipped, there won't be any output related to these modules. ### `--skip_preseq` Turns off the computation of library complexity estimation. +### `--skip_adapterremoval` + +Turns off adaptor trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`. + ### `--skip_damage_calculation` Turns off the DamageProfiler module to compute DNA damage profiles. @@ -333,6 +337,24 @@ Defines the minimum read quality per base that is required for a base to be kept ### `--clip_min_adap_overlap` 1 Sets the minimum overlap between two reads when read merging is performed. Default is set to `1` base overlap. +### `--skip_collapse` + +Turns off the paired-end read merging. + +For example +```bash +--pairedEnd --skip_collapse --reads '*.fastq' +``` + +### `--skip_trim` + +Turns off adaptor and quality trimming. + +For example: +```bash +--pairedEnd --skip_trim --reads '*.fastq' +``` + ## Read Mapping Parameters ## BWA (default) diff --git a/main.nf b/main.nf index ee465329a..5b807c8ef 100644 --- a/main.nf +++ b/main.nf @@ -44,6 +44,7 @@ def helpMessage() { --saveReference Saves reference genome indices for later reusage Skipping Skip any of the mentioned steps + --skip_adapterremoval --skip_preseq --skip_damage_calculation --skip_qualimap @@ -59,6 +60,8 @@ def helpMessage() { --clip_readlength Specify read minimum length to be kept for downstream analysis --clip_min_read_quality Specify minimum base quality for not trimming off bases --min_adap_overlap Specify minimum adapter overlap + --skip_collapse Skip merging forward and reverse reads together. (Only for PE samples) + --skip_trim Skip adaptor and quality trimming BWA Mapping --bwaalnn Specify the -n parameter for BWA aln. @@ -145,6 +148,7 @@ params.email = false params.plaintext_email = false // Skipping parts of the pipeline for impatient users +params.skip_adapterremoval = false params.skip_preseq = false params.skip_damage_calculation = false params.skip_qualimap = false @@ -160,6 +164,8 @@ params.clip_reverse_adaptor = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA" params.clip_readlength = 30 params.clip_min_read_quality = 20 params.min_adap_overlap = 1 +params.skip_collapse = false +params.skip_trim = false //Read mapping parameters (default = BWA aln default) params.bwaalnn = 0.04 @@ -258,6 +264,10 @@ if( params.singleEnd || params.pairedEnd || params.bam){ exit 1, "Please specify either --singleEnd, --pairedEnd to execute the pipeline on FastQ files and --bam for previously processed BAM files!" } +//Validate that skip_collapse is only set to True for pairedEnd reads! +if (params.skip_collapse && params.singleEnd){ + exit 1, "--skip_collapse can only be set for pairedEnd samples!" +} //AWSBatch sanity checking if(workflow.profile == 'awsbatch'){ @@ -343,6 +353,8 @@ summary['Fasta Ref'] = params.fasta summary['BAM Index Type'] = (params.large_ref == "") ? 'BAI' : 'CSI' if(params.bwa_index) summary['BWA Index'] = params.bwa_index summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End' +summary['Skip Collapsing'] = params.skip_collapse ? 'Yes' : 'No' +summary['Skip Trimming'] = params.skip_trim ? 'Yes' : 'No' summary['Max Memory'] = params.max_memory summary['Max CPUs'] = params.max_cpus summary['Max Time'] = params.max_time @@ -362,7 +374,7 @@ if(workflow.profile == 'awsbatch'){ summary['AWS Queue'] = params.awsqueue } if(params.email) summary['E-mail Address'] = params.email -log.info summary.collect { k,v -> "${k.padRight(15)}: $v" }.join("\n") +log.info summary.collect { k,v -> "${k.padRight(35)}: $v" }.join("\n") log.info "=========================================" @@ -501,8 +513,7 @@ process convertBam { file bam from ch_bam_to_fastq_convert output: - set val("${base}"), file("*.fastq.gz") into (ch_read_files_converted_fastqc, ch_read_files_converted_fastp) - file("*.fastq.gz") into (ch_read_files_converted_mapping_bwa, ch_read_files_converted_mapping_cm, ch_read_files_converted_mapping_bwamem) + set val("${base}"), file("*.fastq.gz") into (ch_read_files_converted_fastqc, ch_read_files_converted_fastp, ch_read_files_converted_mapping_bwa, ch_read_files_converted_mapping_cm, ch_read_files_converted_mapping_bwamem) script: base = "${bam.baseName}" @@ -568,60 +579,87 @@ process fastp { /* * STEP 2 - Adapter Clipping / Read Merging */ - - +//Initialize empty channel if we skip adapterremoval entirely +if(params.skip_adapterremoval) { + //No logs if no AR is run + ch_adapterremoval_logs = Channel.empty() + //Either coming from complexity filtering, or directly use reads normally directed to clipping first and push them through to the other channels downstream! + ch_clipped_reads_complexity_filtered_poly_g.mix(ch_read_files_clip).into { ch_clipped_reads;ch_clipped_reads_for_fastqc;ch_clipped_reads_circularmapper;ch_clipped_reads_bwamem } +} else { process adapter_removal { tag "$name" publishDir "${params.outdir}/read_merging", mode: 'copy' - when: !params.bam + when: !params.bam && !params.skip_adapterremoval input: set val(name), file(reads) from ( params.complexity_filter_poly_g ? ch_clipped_reads_complexity_filtered_poly_g : ch_read_files_clip ) output: - file "*.combined*.gz" into (ch_clipped_reads, ch_clipped_reads_for_fastqc,ch_clipped_reads_circularmapper,ch_clipped_reads_bwamem) - file "*.settings" into ch_adapterremoval_logs + set val(base), file("output/*.gz") into (ch_clipped_reads,ch_clipped_reads_for_fastqc,ch_clipped_reads_circularmapper,ch_clipped_reads_bwamem) + file("*.settings") into ch_adapterremoval_logs script: - prefix = reads[0].toString() - ~/(_R1)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ - //Readprefixing only required for PE data with merging - fixprefix = (params.singleEnd) ? "" : "AdapterRemovalFixPrefix ${prefix}.combined.fq.gz ${prefix}.combined.prefixed.fq.gz" + base = reads[0].baseName + //This checks whether we skip trimming and defines a variable respectively + trim_me = params.skip_trim ? '' : "--trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}" + collapse_me = params.skip_collapse ? '' : '--collapse' - if( !params.singleEnd ){ + //PE mode, dependent on trim_me and collapse_me the respective procedure is run or not :-) + if (!params.singleEnd && !params.skip_collapse && !params.skip_trim){ """ - AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${prefix} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} --collapse + mkdir -p output + AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} ${trim_me} --gzip --threads ${task.cpus} ${collapse_me} #Combine files - zcat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz | gzip > ${prefix}.combined.fq.gz - ${fixprefix} - rm ${prefix}.combined.fq.gz + zcat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz | gzip > output/${base}.combined.fq.gz + """ + //PE, don't collapse, but trim reads + } else if (!params.singleEnd && params.skip_collapse && !params.skip_trim) { + """ + mkdir -p output + AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} --gzip --threads ${task.cpus} ${trim_me} ${collapse_me} + mv ${base}.pair*.truncated.gz output/ + """ + //PE, collapse, but don't trim reads + } else if (!params.singleEnd && !params.skip_collapse && params.skip_trim) { + """ + mkdir -p output + AdapterRemoval --file1 ${reads[0]} --file2 ${reads[1]} --basename ${base} --gzip --threads ${task.cpus} --basename ${base} ${collapse_me} ${trim_me} + + mv ${base}.pair*.truncated.gz output/ """ } else { + //SE, collapse not possible, trim reads """ - AdapterRemoval --file1 ${reads[0]} --basename ${prefix} --gzip --threads ${task.cpus} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} - # Pseudo-Combine - mv *.truncated.gz ${prefix}.combined.fq.gz + mkdir -p output + AdapterRemoval --file1 ${reads[0]} --basename ${base} --gzip --threads ${task.cpus} ${trim_me} + + mv *.truncated.gz output/ """ } } +} + + /* - * STEP 2.1 - FastQC after clipping/merging (if applied!) - */ +* STEP 2.1 - FastQC after clipping/merging (if applied!) +*/ process fastqc_after_clipping { - tag "${reads[0].baseName}" + tag "${prefix}" publishDir "${params.outdir}/FastQC/after_clipping", mode: 'copy', saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} - when: !params.bam + when: !params.bam && !params.skip_adapterremoval input: - file(reads) from ch_clipped_reads_for_fastqc + set val(name), file(reads) from ch_clipped_reads_for_fastqc output: file "*_fastqc.{zip,html}" optional true into ch_fastqc_after_clipping script: + prefix = reads[0].toString().tokenize('.')[0] """ fastqc -q $reads """ @@ -638,7 +676,8 @@ process bwa { when: !params.circularmapper && !params.bwamem input: - file(reads) from ch_clipped_reads.mix(ch_read_files_converted_mapping_bwa) + set val(name), file(reads) from ch_clipped_reads.mix(ch_read_files_converted_mapping_bwa) + file index from ch_bwa_index.first() @@ -648,14 +687,28 @@ process bwa { script: - prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ - fasta = "${index}/*.fasta" + fasta = "${index}/*.fasta" size = "${params.large_ref}" ? '-c' : '' + + //PE data without merging, PE data without any AR applied + if (!params.singleEnd && (params.skip_collapse || params.skip_adapterremoval)){ + prefix = reads[0].toString().tokenize('.')[0] """ - bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai" - bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam + bwa aln -t ${task.cpus} $fasta ${reads[0]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r1.sai + bwa aln -t ${task.cpus} $fasta ${reads[1]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r2.sai + bwa sampe -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.r1.sai ${prefix}.r2.sai ${reads[0]} ${reads[1]} | samtools sort -@ ${task.cpus} -O bam - > ${prefix}.sorted.bam samtools index "${size}" "${prefix}".sorted.bam """ + } else { + //PE collapsed, or SE data + prefix = reads[0].toString().tokenize('.')[0] + """ + bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.sai + bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam + samtools index "${size}" "${prefix}".sorted.bam + """ + } + } process circulargenerator{ @@ -694,7 +747,7 @@ process circularmapper{ when: params.circularmapper input: - file reads from ch_clipped_reads_circularmapper.mix(ch_read_files_converted_mapping_cm) + set val(name), file(reads) from ch_clipped_reads_circularmapper.mix(ch_read_files_converted_mapping_cm) file index from ch_circularmapper_indices.first() output: @@ -703,17 +756,31 @@ process circularmapper{ script: filter = "${params.circularfilter}" ? '' : '-f true -x false' - prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ + fasta = "${index}/*_*.fasta" size = "${params.large_ref}" ? '-c' : '' + if (!params.singleEnd && params.skip_collapse ){ + prefix = reads[0].toString().tokenize('.')[0] """ - bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai" - bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads > tmp.out + bwa aln -t ${task.cpus} $fasta ${reads[0]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r1.sai + bwa aln -t ${task.cpus} $fasta ${reads[1]} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.r2.sai + bwa sampe -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.r1.sai ${prefix}.r2.sai ${reads[0]} ${reads[1]} > tmp.out + realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter + samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > ${prefix}.sorted.bam + samtools index "${size}" ${prefix}.sorted.bam + """ + } else { + prefix = reads[0].toString().tokenize('.')[0] + """ + bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${prefix}.sai + bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta ${prefix}.sai $reads > tmp.out realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${prefix}".sorted.bam samtools index "${size}" "${prefix}".sorted.bam """ + } + } process bwamem { @@ -723,7 +790,7 @@ process bwamem { when: params.bwamem && !params.circularmapper input: - file(reads) from ch_clipped_reads_bwamem.mix(ch_read_files_converted_mapping_bwamem) + set val(name), file(reads) from ch_clipped_reads_bwamem.mix(ch_read_files_converted_mapping_bwamem) file index from ch_bwa_index_bwamem.first() output: @@ -735,10 +802,19 @@ process bwamem { prefix = reads[0].toString() - ~/(_R1)?(\.combined\.)?(prefixed)?(_trimmed)?(_val_1)?(\.fq)?(\.fastq)?(\.gz)?$/ fasta = "${index}/*.fasta" size = "${params.large_ref}" ? '-c' : '' + + if (!params.singleEnd && params.skip_collapse){ + """ + bwa mem -t ${task.cpus} $fasta ${reads[0]} ${reads[1]} -R "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam + samtools index "${size}" -@ ${task.cpus} "${prefix}".sorted.bam + """ + } else { """ bwa mem -t ${task.cpus} $fasta $reads -R "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam - samtools index "${size}" -@ ${task.cpus} "${prefix}".sorted.bam + samtools index "${size}" -@ ${task.cpus} "${prefix}".sorted.bam """ + } + } /* diff --git a/nextflow.config b/nextflow.config index 150d9bc25..9fcbb2404 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,7 @@ params { //Pipeline options aligner = 'bwa' + saveReference = false saveTrimmed = true saveAlignedIntermediates = false @@ -30,6 +31,11 @@ params { complexity_filter_poly_g_min = 10 trim_bam = false + //Skipping adapterremoval, trimming or collapsing defaults + skip_adapterremoval = false + skip_trim = false + skip_adapterremoval = false + // AWS Batch awsqueue = false awsregion = 'eu-west-1' @@ -38,6 +44,7 @@ params { custom_config_version = 'master' } + // Load base.config by default for all pipelines includeConfig 'conf/base.config'