diff --git a/.travis.yml b/.travis.yml index 2b056242c..65e95c56f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ before_install: # Pull the docker image first so the test doesn't wait for this - docker pull nfcore/eager # Fake the tag locally so that the pipeline runs properly - - docker tag nfcore/eager nfcore/eager:2.0.2 + - docker tag nfcore/eager nfcore/eager:latest install: # Install Nextflow @@ -37,16 +37,16 @@ script: # Lint the pipeline code - nf-core lint ${TRAVIS_BUILD_DIR} # Run the basic pipeline with the test profile - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --saveReference # Run the basic pipeline with single end data (pretending its single end actually) - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --singleEnd + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --singleEnd --bwa_index results/reference_genome/bwa_index/ # Run the same pipeline testing optional step: fastp, complexity - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/ # Test BAM Trimming - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --trim_bam + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --trim_bam --bwa_index results/reference_genome/bwa_index/ # Test running with CircularMapper - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --circularmapper --circulartarget 'NC_007596.2' # Test running with BWA Mem - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --bwamem + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --bwamem --bwa_index results/reference_genome/bwa_index/ # Test basic pipeline with Conda too - - nextflow run ${TRAVIS_BUILD_DIR} -profile test,conda --pairedEnd \ No newline at end of file + - travis_wait 25 nextflow run ${TRAVIS_BUILD_DIR} -profile test,conda --pairedEnd --bwa_index results/reference_genome/bwa_index/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index fc60f786c..3bd13fa21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
-## unpublished +## [Unpublished] +## [2.0.3] - 2018-12-09 + +### `Added` +* [#80](https://github.com/nf-core/eager/pull/80) - BWA Index file handling +* [#77](https://github.com/nf-core/eager/pull/77) - Lots of documentation updates by [@jfy133](https://github.com/jfy133) +* [#81](https://github.com/nf-core/eager/pull/81) - Renaming of certain BAM options +* [#92](https://github.com/nf-core/eager/issues/92) - Complete restructure of BAM options + +### `Fixed` +* [#85](https://github.com/nf-core/eager/pull/85) - Fix for [Samtools index issues](https://github.com/nf-core/eager/issues/84) +* [#96](https://github.com/nf-core/eager/issues/96) - Fix for [MarkDuplicates issues](https://github.com/nf-core/eager/issues/96) found by [@nilesh-tawari](https://github.com/nilesh-tawari) + +### Other +* Added Slack button to repository readme + ## [2.0.2] - 2018-11-03 ### `Changed` diff --git a/Dockerfile b/Dockerfile index de193795c..18dce4985 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,4 +3,4 @@ FROM nfcore/base LABEL description="Docker image containing all requirements for nf-core/eager pipeline" COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-eager-2.0.2/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-eager-2.0.3dev/bin:$PATH diff --git a/README.md b/README.md index 1b1247d07..0f6c3ee29 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,7 @@ [![Build Status](https://travis-ci.org/nf-core/eager.svg?branch=master)](https://travis-ci.org/nf-core/eager) [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.32.0-brightgreen.svg)](https://www.nextflow.io/) -[![Gitter](https://img.shields.io/badge/gitter-%20join%20chat%20%E2%86%92-4fb99a.svg)](https://gitter.im/nf-core/eager) -[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) +[![Slack Status](https://nf-core-invite.herokuapp.com/badge.svg)](https://nf-core-invite.herokuapp.com) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) [![Docker Container available](https://img.shields.io/docker/automated/nfcore/eager.svg)](https://hub.docker.com/r/nfcore/eager/) ![Singularity Container available](https://img.shields.io/badge/singularity-available-7E4C74.svg) [![DOI](https://zenodo.org/badge/135918251.svg)](https://zenodo.org/badge/latestdoi/135918251) @@ -12,28 +11,60 @@ ## Introduction -**nf-core/eager** is a bioinformatics best-practice analysis pipeline for ancient DNA data analysis. +**nf-core/eager** is a bioinformatics best-practice analysis pipeline for NGS-based +ancient DNA (aDNA) data analysis. -The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow tool. It pre-processes raw data from FastQ inputs, aligns the reads and performs extensive quality-control on the results. It comes with docker / singularity containers making installation trivial and results highly reproducible. +The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics +workflow tool. It pre-processes raw data from FASTQ inputs, aligns the reads +and performs extensive general NGS and aDNA-specific quality control on the +results. It comes with docker and singularity containers, as well as a conda +environment, making installation trivial and results highly reproducible.
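As a minimal illustration of that point, the same run works with any of the three software back-ends just by switching the profile (a sketch; read and reference paths are placeholders):

```bash
# Same analysis, three interchangeable software back-ends (illustrative paths)
nextflow run nf-core/eager -profile docker      --pairedEnd --reads '/data/*_R{1,2}.fastq.gz' --fasta '/data/reference.fasta'
nextflow run nf-core/eager -profile singularity --pairedEnd --reads '/data/*_R{1,2}.fastq.gz' --fasta '/data/reference.fasta'
nextflow run nf-core/eager -profile conda       --pairedEnd --reads '/data/*_R{1,2}.fastq.gz' --fasta '/data/reference.fasta'
```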
-### Pipeline steps +## Pipeline steps -* Create reference genome indices (optional) - * BWA - * Samtools Index - * Sequence Dictionary -* QC with FastQC -* AdapterRemoval for read clipping and merging -* Read mapping with BWA, BWA Mem or CircularMapper -* Samtools sort, index, stats & conversion to BAM -* DeDup or MarkDuplicates read deduplication -* QualiMap BAM QC Checking -* Preseq Library Complexity Estimation -* DamageProfiler damage profiling -* BAM Clipping for UDG+/UDGhalf protocols -* PMDTools damage filtering / assessment +By default the pipeline currently performs the following: + +* Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`) +* Sequencing quality control (`FastQC`) +* Sequencing adapter removal and, for paired-end data, read merging (`AdapterRemoval`) +* Read mapping to a reference genome (`bwa aln`, `bwa mem` or `CircularMapper`) +* Post-mapping processing, statistics and conversion to BAM (`samtools`) +* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`) +* PCR duplicate removal (`DeDup` or `MarkDuplicates`) +* Post-mapping statistics and BAM quality control (`Qualimap`) +* Library complexity estimation (`preseq`) +* Overall pipeline statistics summaries (`MultiQC`) + +Additional functionality currently included in the pipeline: + +* Illumina two-coloured sequencer poly-G tail removal (`fastp`) +* Automatic conversion of unmapped reads to FASTQ (`samtools`) +* Damage removal/clipping for UDG+/UDG-half treatment protocols (`BamUtil`) +* Damaged read extraction and assessment (`PMDTools`) + +## Quick Start + +1. Install [`nextflow`](docs/installation.md) +2. Install one of [`docker`](https://docs.docker.com/engine/installation/), [`singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`conda`](https://conda.io/miniconda.html) +3. Download the EAGER pipeline + +```bash +nextflow pull nf-core/eager +``` + +4. Set up your job with default parameters + +```bash +nextflow run nf-core/eager -profile <docker/singularity/conda> --pairedEnd --reads '*_R{1,2}.fastq.gz' --fasta '<YOUR_REFERENCE>.fasta' +``` + +5. Once your run has completed, check your results, including the run summary report at `<OUTPUT_DIR>/MultiQC/multiqc_report.html` + +Modifications to the default pipeline are easily made using various options +as described in the documentation. + +## Documentation -### Documentation The nf-core/eager pipeline comes with documentation about the pipeline, found in the `docs/` directory: 1. [Installation](docs/installation.md) @@ -44,5 +75,25 @@ The nf-core/eager pipeline comes with documentation about the pipeline, found in 4. [Output and how to interpret the results](docs/output.md) 5. [Troubleshooting](docs/troubleshooting.md) -### Credits -This pipeline was written by Alexander Peltzer ([apeltzer](https://github.com/apeltzer)), with major contributions from Stephen Clayton, ideas and documentation from James Fellows-Yates, Raphael Eisenhofer and Judith Neukamm. If you want to contribute, please open an issue and ask to be added to the project - happy to do so and everyone is welcome to contribute here! \ No newline at end of file +## Credits + +This pipeline was written by Alexander Peltzer ([apeltzer](https://github.com/apeltzer)), +with major contributions from Stephen Clayton, ideas and documentation from +James Fellows Yates, Raphael Eisenhofer and Judith Neukamm. If you want to +contribute, please open an issue and ask to be added to the project - happy to +do so and everyone is welcome to contribute here! + +## Tool References + +* **EAGER v1, CircularMapper, DeDup** Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). 
EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. [https://doi.org/10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z) Download: [https://github.com/apeltzer/EAGER-GUI](https://github.com/apeltzer/EAGER-GUI) and [https://github.com/apeltzer/EAGER-CLI](https://github.com/apeltzer/EAGER-CLI) +* **FastQC** download: [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +* **AdapterRemoval v2** Schubert, M., Lindgreen, S., & Orlando, L. (2016). AdapterRemoval v2: rapid adapter trimming, identification, and read merging. BMC Research Notes, 9, 88. [https://doi.org/10.1186/s13104-016-1900-2](https://doi.org/10.1186/s13104-016-1900-2) Download: [https://github.com/MikkelSchubert/adapterremoval](https://github.com/MikkelSchubert/adapterremoval) +* **bwa** Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25(14), 1754–1760. [https://doi.org/10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324) Download: [http://bio-bwa.sourceforge.net/bwa.shtml](http://bio-bwa.sourceforge.net/bwa.shtml) +* **SAMtools** Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics, 25(16), 2078–2079. [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352) Download: [http://www.htslib.org/](http://www.htslib.org/) +* **DamageProfiler** Judith Neukamm (Unpublished) +* **QualiMap** Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics, 32(2), 292–294. [https://doi.org/10.1093/bioinformatics/btv566](https://doi.org/10.1093/bioinformatics/btv566) Download: [http://qualimap.bioinfo.cipf.es/](http://qualimap.bioinfo.cipf.es/) +* **preseq** Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of sequencing libraries. Nature Methods, 10(4), 325–327. [https://doi.org/10.1038/nmeth.2375](https://doi.org/10.1038/nmeth.2375). Download: [http://smithlabresearch.org/software/preseq/](http://smithlabresearch.org/software/preseq/) +* **PMDTools** Skoglund, P., Northoff, B. H., Shunkov, M. V., Derevianko, A. P., Pääbo, S., Krause, J., & Jakobsson, M. (2014). Separating endogenous ancient DNA from modern day contamination in a Siberian Neandertal. Proceedings of the National Academy of Sciences of the United States of America, 111(6), 2229–2234. [https://doi.org/10.1073/pnas.1318934111](https://doi.org/10.1073/pnas.1318934111) Download: [https://github.com/pontussk/PMDtools](https://github.com/pontussk/PMDtools) +* **MultiQC** Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047–3048. [https://doi.org/10.1093/bioinformatics/btw354](https://doi.org/10.1093/bioinformatics/btw354) Download: [https://multiqc.info/](https://multiqc.info/) +* **BamUtil** Jun, G., Wing, M. K., Abecasis, G. R., & Kang, H. M. (2015). An efficient and scalable analysis framework for variant extraction and refinement from population-scale DNA sequence data. Genome Research, 25(6), 918–925. 
[https://doi.org/10.1101/gr.176552.114](https://doi.org/10.1101/gr.176552.114) Download: [https://genome.sph.umich.edu/wiki/BamUtil](https://genome.sph.umich.edu/wiki/BamUtil) +* **fastp** Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics, 34(17), i884–i890. [https://doi.org/10.1093/bioinformatics/bty560](https://doi.org/10.1093/bioinformatics/bty560) Download: [https://github.com/OpenGene/fastp](https://github.com/OpenGene/fastp) diff --git a/Singularity b/Singularity index 7e1d297ef..407c69263 100644 --- a/Singularity +++ b/Singularity @@ -4,10 +4,10 @@ Bootstrap:docker %labels MAINTAINER Alexander Peltzer DESCRIPTION Container image containing all requirements for the nf-core/eager pipeline - VERSION 2.0.2 + VERSION 2.0.3dev %environment - PATH=/opt/conda/envs/nf-core-eager-2.0.2/bin:$PATH + PATH=/opt/conda/envs/nf-core-eager-2.0.3dev/bin:$PATH export PATH %files diff --git a/conf/acad-pheonix.config b/conf/acad-pheonix.config new file mode 100644 index 000000000..f789eff74 --- /dev/null +++ b/conf/acad-pheonix.config @@ -0,0 +1,23 @@ +/* + * ---------------------------------------------------------------------------- + * Nextflow config file for use with Singularity on Phoenix Cluster Adelaide + * ---------------------------------------------------------------------------- + * Defines basic usage limits and singularity image id. + */ + +singularity { + enabled = true + envWhitelist='SINGULARITY_BINDPATH' + autoMounts = true +} + +process { + beforeScript = 'module load Singularity/2.5.2-GCC-5.4.0-2.26' + executor = 'slurm' +} + +params { + max_memory = 128.GB + max_cpus = 32 + max_time = 48.h +} diff --git a/conf/binac.config b/conf/binac.config index c35e58c92..68ba98f1e 100644 --- a/conf/binac.config +++ b/conf/binac.config @@ -10,7 +10,7 @@ singularity { } process { - beforeScript = 'module load devel/singularity/2.4.1' + beforeScript = 'module load devel/singularity/2.6.0' executor = 'pbs' queue = 'short' } diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml index d714e8ee0..d5f5a6540 100644 --- a/conf/multiqc_config.yaml +++ b/conf/multiqc_config.yaml @@ -5,3 +5,15 @@ report_comment: > report_section_order: nf-core/eager-software-versions: order: -1000 + fastqc: + after: 'nf-core/eager-software-versions' + adapterRemoval: + after: 'fastqc' + Samtools: + after: 'adapterRemoval' + dedup: + after: 'Samtools' + qualimap: + after: 'dedup' + preseq: + after: 'qualimap' \ No newline at end of file diff --git a/conf/shh.config b/conf/shh.config new file mode 100644 index 000000000..8b7b6ade0 --- /dev/null +++ b/conf/shh.config @@ -0,0 +1,36 @@ +/* + * ------------------------------------------------------------- + * Nextflow config file for use with Singularity at SHH Clusters + * ------------------------------------------------------------- + * Defines basic usage limits and singularity image id. + */ + +singularity { + enabled = true +} + +/* +* To be improved by process-specific resource requests +* By default, take the medium queue; smaller processes might just go to short (e.g. 
multiqc or similar things) +*/ + +process { + executor = 'slurm' + queue = 'medium' + + + withName:makeFastaIndex { + queue = 'short' + time = 2.h + } + withName:makeSeqDict { + queue = 'short' + time = 2.h + } +} + +params { + max_memory = 734.GB + max_cpus = 64 + max_time = 48.h +} diff --git a/docs/configuration/adding_your_own.md b/docs/configuration/adding_your_own.md index 62bfca9c9..01b08b060 100644 --- a/docs/configuration/adding_your_own.md +++ b/docs/configuration/adding_your_own.md @@ -51,7 +51,6 @@ Note that the dockerhub organisation name annoyingly can't have a hyphen, so is ### Singularity image Many HPC environments are not able to run Docker due to security issues. [Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. ->>>>>>> TEMPLATE To specify singularity usage in your pipeline config file, add the following: @@ -81,5 +80,4 @@ To use conda in your own config file, add the following: ```nextflow process.conda = "$baseDir/environment.yml" ->>>>>>> TEMPLATE ``` diff --git a/docs/configuration/local.md b/docs/configuration/local.md index 09a6d3adf..dc8efa52d 100644 --- a/docs/configuration/local.md +++ b/docs/configuration/local.md @@ -11,7 +11,7 @@ First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/) Then, simply run the analysis pipeline: ```bash -nextflow run nf-core/eager -profile docker --reads '' +nextflow run nf-core/eager -profile docker --reads '<path to your reads>' --pairedEnd ``` Nextflow will recognise `nf-core/eager` and download the pipeline from GitHub. The `-profile docker` configuration lists the [nf-core/eager](https://hub.docker.com/r/nfcore/eager/) image that we have created and is hosted at dockerhub, and this is downloaded. @@ -23,9 +23,13 @@ The public docker images are tagged with the same version numbers as the code, w ## Singularity image -Many HPC environments are not able to run Docker due to security issues. [Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. Even better, it can use create images directly from dockerhub. +Many HPC environments are not able to run Docker due to security issues. [Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. There is a dedicated profile that will download the singularity image for you. -To use the singularity image for a single run, use `-with-singularity 'docker://nfcore/eager'`. This will download the docker container from dockerhub and create a singularity image for you dynamically. +```bash +nextflow run nf-core/eager -profile singularity --reads '<path to your reads>' --pairedEnd +``` + +Additionally, singularity can create images directly from dockerhub. To use the singularity image for a single run, use `-with-singularity 'docker://nfcore/eager'`. This will download the docker container from dockerhub and create a singularity image for you dynamically. If you intend to run the pipeline offline, nextflow will not be able to automatically download the singularity image for you. Instead, you'll have to do this yourself manually first, transfer the image file and then point to that. 
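The hunk below shows the basic image pull; as a complementary sketch, pinning a released tag and transferring the image explicitly might look like this (the tag, host name and paths are illustrative, assuming the `2.0.2` tag is published on dockerhub):

```bash
# On a machine with internet access: build a Singularity image from a tagged Docker image
singularity pull --name nf-core-eager-2.0.2.img docker://nfcore/eager:2.0.2
# Transfer the image to the offline cluster (host and destination are illustrative)
scp nf-core-eager-2.0.2.img cluster.example.org:/data/singularity-images/
# On the cluster: point the pipeline at the local image file
nextflow run /path/to/nf-core/eager -with-singularity /data/singularity-images/nf-core-eager-2.0.2.img \
    --reads '<path to your reads>' --pairedEnd
```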
@@ -38,5 +42,14 @@ singularity pull --name nf-core-eager.img docker://nfcore/eager Then transfer this file and run the pipeline with this path: ```bash -nextflow run /path/to/nf-core/eager -with-singularity /path/to/nf-core-eager.img +nextflow run /path/to/nf-core/eager -with-singularity /path/to/nf-core-eager.img --reads '<path to your reads>' --pairedEnd ``` + +## Conda + +You may also use conda (utilising the bioconda repository) to download the pipeline dependencies for you. + +```bash +nextflow run nf-core/eager -profile conda --reads '<path to your reads>' --pairedEnd +``` + diff --git a/docs/installation.md b/docs/installation.md index 41022e1fa..4bbd71614 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -69,6 +69,19 @@ Be warned of two important points about this default configuration: * See the [nextflow docs](https://www.nextflow.io/docs/latest/executor.html) for information about running with other hardware backends. Most job scheduler systems are natively supported. 2. Nextflow will expect all software to be installed and available on the `PATH` +The following software is currently required to be installed: + +* [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +* [Picard Tools](https://broadinstitute.github.io/picard/) +* [Samtools](http://www.htslib.org/) +* [Preseq](http://smithlabresearch.org/software/preseq/) +* [MultiQC](https://multiqc.info/) +* [BWA](http://bio-bwa.sourceforge.net/) +* [Qualimap](http://qualimap.bioinfo.cipf.es/) +* [GATK](https://software.broadinstitute.org/gatk/) +* [bamUtil](https://genome.sph.umich.edu/wiki/BamUtil) +* [fastp](https://github.com/OpenGene/fastp) + #### 3.1) Software deps: Docker First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/) @@ -99,6 +112,14 @@ This is slower and less reproducible than the above, but is still better than ha The pipeline ships with a conda environment file and nextflow has built-in support for this. To use it first ensure that you have conda installed (we recommend [miniconda](https://conda.io/miniconda.html)), then follow the same pattern as above and use the flag `-profile standard,conda` +#### 4) Profile configuration +Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you keep the process running in the background through `screen` / `tmux` or a similar tool. Alternatively you can run nextflow within a cluster job submitted to your job scheduler. + +It is recommended to limit the Nextflow Java virtual machine's memory. We recommend adding the following line to your environment (typically in `~/.bashrc` or `~/.bash_profile`): + +```bash +NXF_OPTS='-Xms1g -Xmx4g' +``` ## Appendices @@ -111,5 +132,4 @@ Note that you will need to specify your UPPMAX project ID when running a pipelin ```nextflow params.project = 'project_ID' // eg. 
b2017123 ->>>>>>> TEMPLATE ``` diff --git a/docs/usage.md b/docs/usage.md index 16322979d..b42664892 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -7,48 +7,14 @@ * [Updating the pipeline](#updating-the-pipeline) * [Reproducibility](#reproducibility) * [Main arguments](#main-arguments) - * [`-profile`](#-profile-single-dash) - * [`docker`](#docker) - * [`awsbatch`](#awsbatch) - * [`standard`](#standard) - * [`binac`](#binac) - * [`cfc`](#cfc) - * [`uzh`](#uzh) - * [`none`](#none) - * [`--reads`](#--reads) - * [`--singleEnd`](#--singleend) -* [Reference Genomes](#reference-genomes) - * [`--genome`](#--genome) - * [`--fasta`](#--fasta) * [Job Resources](#job-resources) * [Automatic resubmission](#automatic-resubmission) * [Custom resource requests](#custom-resource-requests) * [AWS batch specific parameters](#aws-batch-specific-parameters) - * [`-awsbatch`](#-awsbatch) - * [`--awsqueue`](#--awsqueue) - * [`--awsregion`](#--awsregion) * [Other command line parameters](#other-command-line-parameters) - * [`--outdir`](#--outdir) - * [`--email`](#--email) - * [`-name`](#-name-single-dash) - * [`-resume`](#-resume-single-dash) - * [`-c`](#-c-single-dash) - * [`--max_memory`](#--max_memory) - * [`--max_time`](#--max_time) - * [`--max_cpus`](#--max_cpus) - * [`--plaintext_emails`](#--plaintext_emails) - * [`--sampleLevel`](#--sampleLevel) - * [`--multiqc_config`](#--multiqc_config) - -## General Nextflow info -Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you put the process running in the background through `screen` / `tmux` or similar tool. Alternatively you can run nextflow within a cluster job submitted your job scheduler. - -It is recommended to limit the Nextflow Java virtual machines memory. We recommend adding the following line to your environment (typically in `~/.bashrc` or `~./bash_profile`): - -```bash -NXF_OPTS='-Xms1g -Xmx4g' -``` +* [Adjustable parameters for nf-core/eager](#adjustable-parameters-for-nf-coreeager) +## Preamble To access the nextflow help message run: `nextflow run -help` ## Running the pipeline @@ -116,7 +82,6 @@ Use this parameter to choose a configuration profile. Profiles can give configur * `test` * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters ->>>>>>> TEMPLATE * `none` * No configuration at all. Useful if you want to build your own config from scratch and want to avoid loading in the default `base` config profile (not recommended). @@ -155,9 +120,18 @@ A normal glob pattern, enclosed in quotation marks, can then be used for `--reads`. ## Reference Genomes -The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. +### `--fasta` +If you prefer, you can specify the full path to your reference genome when you run the pipeline: + +```bash +--fasta '[path to Fasta reference]' +``` +> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that saving these for later has to be turned on using `--saveReference`. ### `--genome` (using iGenomes) + +The pipeline config files come bundled with paths to the illumina iGenomes reference index files. 
If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. + There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are: @@ -189,14 +163,6 @@ params { } ``` -### `--fasta` -If you prefer, you can specify the full path to your reference genome when you run the pipeline: - -```bash ---fasta '[path to Fasta reference]' -``` -> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note, that saving these for later has to be turned on using `--saveReference`. - ### `--bwa_index` Use this to specify a previously created BWA index. This saves time in pipeline execution and is especially advised when running multiple times on the same cluster system for example. You can even add a resource specific profile that sets paths to pre-computed reference genomes, saving even more time when specifying these. @@ -409,17 +375,13 @@ Turn this on to utilize BWA Mem instead of `bwa aln` for alignment. Can be quite Users can configure to keep/discard/extract certain groups of reads efficiently in the nf-core/eager pipeline. -### `--bam_keep_mapped_only` - -This can be used to only keep mapped reads for downstream analysis. By default turned off, all reads are kept in the BAM file. Unmapped reads are stored both in BAM and FastQ format e.g. for different downstream processing. -### `--bam_keep_all` +### `--bam_discard_unmapped` -Turned on by default, keeps all reads that were mapped in the dataset. +Defines whether unmapped reads should be discarded from the BAM output, optionally storing them separately in FastQ and/or BAM format. The exact behaviour depends on the choice of `--bam_unmapped_type`. -### `--bam_filter_reads` +### `--bam_unmapped_type` -Specify this, if you want to filter reads for downstream analysis. +Defines how to proceed with unmapped reads: "discard" removes all unmapped reads, "bam" keeps unmapped reads as a separate BAM file, "fastq" keeps unmapped reads as a separate FastQ file, "both" keeps both BAM and FastQ files. Only effective when option `--bam_discard_unmapped` is turned on. 
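Putting the two options together, a hypothetical invocation that quality-filters the main BAM and keeps the unmapped reads in both formats (paths are illustrative):

```bash
# Remove unmapped reads from the main BAM, but keep them separately
# as both BAM and gzipped FastQ ('both')
nextflow run nf-core/eager -profile docker --pairedEnd \
    --reads '/data/*_R{1,2}.fastq.gz' --fasta '/data/reference.fasta' \
    --bam_discard_unmapped --bam_unmapped_type 'both'
```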
### `--bam_mapping_quality_threshold` diff --git a/environment.yml b/environment.yml index c8cbe7309..e8d5738a7 100644 --- a/environment.yml +++ b/environment.yml @@ -1,30 +1,30 @@ -name: nf-core-eager-2.0.2 +name: nf-core-eager-2.0.3dev channels: - defaults - bioconda - conda-forge dependencies: - - defaults::openjdk=8.0.152 - - fastqc=0.11.8 - - adapterremoval=2.2.2 - - adapterremovalfixprefix=0.0.4 - - bwa=0.7.17 - - picard=2.18.15 - - samtools=1.9 - - dedup=0.12.3 - - angsd=0.923 - - circularmapper=1.93.4 - - gatk4=4.0.10.0 - - qualimap=2.2.2b - - vcf2genome=0.91 - - damageprofiler=0.3.11 - - multiqc=1.6 - - pmdtools=0.60 - - r-rmarkdown=1.10 - - libiconv=1.15 + - anaconda::openjdk=8.0.152 + - bioconda::fastqc=0.11.8 + - bioconda::adapterremoval=2.2.2 + - bioconda::adapterremovalfixprefix=0.0.4 + - bioconda::bwa=0.7.17 + - bioconda::picard=2.18.15 + - bioconda::samtools=1.9 + - bioconda::dedup=0.12.3 + - bioconda::angsd=0.923 + - bioconda::circularmapper=1.93.4 + - bioconda::gatk4=4.0.11.0 + - bioconda::qualimap=2.2.2b + - bioconda::vcf2genome=0.91 + - bioconda::damageprofiler=0.3.11 + - bioconda::multiqc=1.6 + - bioconda::pmdtools=0.60 + - conda-forge::r-rmarkdown=1.10 + - conda-forge::libiconv=1.15 - conda-forge::pigz=2.3.4 + - bioconda::sequencetools=1.2.2 + - bioconda::preseq=2.0.3 + - bioconda::fastp=0.19.4 + - bioconda::bamutil=1.0.14 - - sequencetools=1.2.2 - - preseq=2.0.3 - - fastp=0.19.4 - - bamutil=1.0.14 #Missing Schmutzi,snpAD diff --git a/main.nf b/main.nf index f6f23f457..bf3371045 100644 --- a/main.nf +++ b/main.nf @@ -26,17 +26,18 @@ def helpMessage() { Mandatory arguments: --reads Path to input data (must be surrounded with quotes) - -profile Hardware config to use. docker / aws + -profile Hardware config to use (e.g. standard, docker, singularity, conda, aws). Ask your system admin if unsure, or check the documentation. + --singleEnd Specifies that the input is single end reads (required if not pairedEnd) + --pairedEnd Specifies that the input is paired end reads (required if not singleEnd) + --fasta Path to Fasta reference (required if not iGenome reference) + --genome Name of iGenomes reference (required if not fasta reference) - Options: - --genome Name of iGenomes reference - --singleEnd Specifies that the input is single end reads + Input Data Additional Options: --snpcapture Runs in SNPCapture mode (specify a BED file if you do this!) --udg Specify that your libraries are treated with UDG --udg_type Specify here if you have UDG half treated libraries, Set to 'Half' in that case References If not specified in the configuration file or you wish to overwrite any of the references. 
- --fasta Path to Fasta reference --bwa_index Path to BWA index --bedfile Path to BED file for SNPCapture methods --seq_dict Path to sequence dictionary file @@ -54,8 +55,8 @@ def helpMessage() { --complexity_filter_poly_g_min Specify poly-g min filter (default: 10) for filtering Clipping / Merging - --clip_forward_adaptor Specify adapter to be clipped off (forward) - --clip_reverse_adaptor Specify adapter to be clipped off (reverse) + --clip_forward_adaptor Specify adapter sequence to be clipped off (forward) + --clip_reverse_adaptor Specify adapter sequence to be clipped off (reverse) --clip_readlength Specify read minimum length to be kept for downstream analysis --clip_min_read_quality Specify minimum base quality for not trimming off bases --min_adap_overlap Specify minimum adapter overlap @@ -75,9 +76,9 @@ def helpMessage() { --bwamem Turn on BWA Mem instead of CM/BWA aln for mapping BAM Filtering - --bam_keep_mapped_only Only consider mapped reads for downstream analysis. Unmapped reads are extracted to separate output. - --bam_filter_reads Keep all reads in BAM file for downstream analysis - --bam_mapping_quality_threshold Minimum mapping quality for reads filter + --bam_discard_unmapped Discards unmapped reads from the output BAM; storage of these reads is controlled by --bam_unmapped_type. + --bam_unmapped_type Defines how unmapped reads are handled: discarded entirely, or kept separately in BAM and/or FastQ format (options: discard, bam, fastq, both). + --bam_mapping_quality_threshold Minimum mapping quality for reads filter, default 0. DeDuplication --dedupper Deduplication method to use @@ -172,11 +173,12 @@ params.circularfilter = false params.bwamem = false //BAM Filtering steps (default = keep mapped and unmapped in BAM file) -params.bam_keep_mapped_only = false -params.bam_keep_all = true -params.bam_filter_reads = false +params.bam_discard_unmapped = false +params.bam_unmapped_type = '' + params.bam_mapping_quality_threshold = 0 + //DamageProfiler settings params.damageprofiler_length = 100 params.damageprofiler_threshold = 15 @@ -210,9 +212,24 @@ wherearemyfiles = file("$baseDir/assets/where_are_my_files.txt") // Validate inputs Channel.fromPath("${params.fasta}") - .ifEmpty { exit 1, "No genome specified! Please specify one with --fasta or --bwa_index"} + .ifEmpty { exit 1, "No genome specified! Please specify one with --fasta"} .into {ch_fasta_for_bwa_indexing;ch_fasta_for_faidx_indexing;ch_fasta_for_dict_indexing; ch_fasta_for_bwa_mapping; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_circularmapper; ch_fasta_for_circularmapper_index;ch_fasta_for_bwamem_mapping} +//Index files provided? Then check whether they are correct and complete +if (params.aligner != 'bwa' && !params.circularmapper && !params.bwamem){ + exit 1, "Invalid aligner option. Default is bwa, but specify --circularmapper or --bwamem to use these." +} +if( params.bwa_index && (params.aligner == 'bwa' || params.bwamem)){ + bwa_index = Channel + .fromPath("${params.bwa_index}/**.*") + .ifEmpty { exit 1, "BWA index not found: ${params.bwa_index}" } + .into{ch_bwa_index_existing;ch_bwa_index_bwamem_existing} +} else { + //Create empty channels to make sure later mix() does not fail + ch_bwa_index_existing = Channel.empty() + ch_bwa_index_bwamem_existing = Channel.empty() +} + //Validate that either pairedEnd or singleEnd has been specified by the user! 
if( params.singleEnd || params.pairedEnd ){ } else { @@ -271,7 +288,7 @@ if(params.readPaths){ // Header log info log.info "=========================================" -log.info " nf-core/eager v${params.pipelineVersion}" +log.info " nf-core/eager v${workflow.manifest.version}" log.info "=========================================" def summary = [:] summary['Pipeline Name'] = 'nf-core/eager' @@ -279,6 +296,7 @@ summary['Pipeline Version'] = workflow.manifest.version summary['Run Name'] = custom_runName ?: workflow.runName summary['Reads'] = params.reads summary['Fasta Ref'] = params.fasta +if(params.bwa_index) summary['BWA Index'] = params.bwa_index summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End' summary['Max Memory'] = params.max_memory summary['Max CPUs'] = params.max_cpus @@ -363,7 +381,7 @@ process makeBWAIndex { else null } - when: !params.bwa_index && params.fasta && params.aligner == 'bwa' + when: !params.bwa_index && params.fasta && (params.aligner == 'bwa' || params.bwamem) input: file fasta from ch_fasta_for_bwa_indexing @@ -560,7 +578,7 @@ process bwa { input: file(reads) from ch_clipped_reads - file "*" from ch_bwa_index + file "*" from ch_bwa_index.mix(ch_bwa_index_existing).collect() file fasta from ch_fasta_for_bwa_mapping output: @@ -573,7 +591,7 @@ process bwa { """ bwa aln -t ${task.cpus} $fasta $reads -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f "${reads.baseName}.sai" bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" $fasta "${reads.baseName}".sai $reads | samtools sort -@ ${task.cpus} -O bam - > "${prefix}".sorted.bam - samtools index -@ ${task.cpus} "${prefix}".sorted.bam + samtools index "${prefix}".sorted.bam """ } @@ -625,7 +643,7 @@ process circularmapper{ bwa samse -r "@RG\\tID:ILLUMINA-${prefix}\\tSM:${prefix}\\tPL:illumina" "${fasta.baseName}_${params.circularextension}.fasta" "${reads.baseName}".sai $reads > tmp.out realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${prefix}".sorted.bam - samtools index -@ ${task.cpus} "${prefix}".sorted.bam + samtools index "${prefix}".sorted.bam """ } @@ -637,7 +655,7 @@ process bwamem { input: file(reads) from ch_clipped_reads_bwamem - file "*" from ch_bwa_index_bwamem + file "*" from ch_bwa_index_bwamem.mix(ch_bwa_index_bwamem_existing).collect() file fasta from ch_fasta_for_bwamem_mapping output: @@ -694,26 +712,39 @@ process samtools_filter { output: file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk - file "*.fq.gz" optional true + file "*.fastq.gz" optional true file "*.unmapped.bam" optional true file "*.bai" - when: "${params.bam_filter_reads}" - script: prefix="$bam" - ~/(\.bam)?/ - - if("${params.bam_keep_mapped_only}"){ - """ - samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) - samtools fastq -tn "${prefix}.unmapped.bam" | gzip > "${prefix}.unmapped.fq.gz" - samtools index -@ ${task.cpus} ${prefix}.filtered.bam - """ - } else { - """ - samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o 
${prefix}.filtered.bam) - samtools index -@ ${task.cpus} ${prefix}.filtered.bam - """ + + if(params.bam_discard_unmapped && params.bam_unmapped_type == "discard"){ + """ + samtools view -h -b $bam -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam + """ + } else if(params.bam_discard_unmapped && params.bam_unmapped_type == "bam"){ + """ + samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) + samtools index ${prefix}.filtered.bam + """ + } else if(params.bam_discard_unmapped && params.bam_unmapped_type == "fastq"){ + """ + samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) + samtools index ${prefix}.filtered.bam + samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz + rm ${prefix}.unmapped.bam + """ + } else if(params.bam_discard_unmapped && params.bam_unmapped_type == "both"){ + """ + samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) + samtools index ${prefix}.filtered.bam + samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz + """ + } else { //Only apply quality filtering, default + """ + samtools view -h -b $bam -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam + """ + } } @@ -747,14 +778,14 @@ process dedup{ dedup -i $bam $treat_merged -o . -u mv *.log dedup.log samtools sort -@ ${task.cpus} "$prefix"_rmdup.bam -o "$prefix".sorted.bam - samtools index -@ ${task.cpus} "$prefix".sorted.bam + samtools index "$prefix".sorted.bam """ } else { """ dedup -i $bam $treat_merged -o . 
-u mv *.log dedup.log samtools sort -@ ${task.cpus} "$prefix"_rmdup.bam -o "$prefix".sorted.bam - samtools index -@ ${task.cpus} "$prefix".sorted.bam + samtools index "$prefix".sorted.bam """ } } @@ -862,15 +893,15 @@ process markDup{ script: prefix = "${bam.baseName}" """ - picard MarkDuplicates INPUT=$bam OUTPUT=${prefix}.markDup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE=${prefix}.markdup.metrics" VALIDATION_STRINGENCY=SILENT + picard MarkDuplicates INPUT=$bam OUTPUT=${prefix}.markDup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${prefix}.markdup.metrics" VALIDATION_STRINGENCY=SILENT """ } //If no deduplication runs, the input is mixed directly from samtools filter, if it runs either markdup or dedup is used thus mixed from these two channels -ch_dedup_for_pmdtools = Channel.create() +ch_dedup_for_pmdtools = Channel.empty() //Bamutils TrimBam Channel -ch_for_bamutils = Channel.create() +ch_for_bamutils = Channel.empty() if(!params.skip_deduplication){ ch_dedup_for_pmdtools.mix(ch_markdup_bam,ch_dedup_bam).into {ch_for_pmdtools;ch_for_bamutils} diff --git a/nextflow.config b/nextflow.config index 3d7e9e93c..4a4c5e0c9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,8 +10,7 @@ // Global default params, used in configs params { - pipelineVersion = '2.0.0dev' // Pipeline version - container = 'nfcore/eager:2.0.2' + container = 'nfcore/eager:latest' //Pipeline options aligner = 'bwa' @@ -59,6 +58,14 @@ profiles { includeConfig 'conf/base.config' includeConfig 'conf/uzh.config' } + phoenix { + includeConfig 'conf/base.config' + includeConfig 'conf/acad-pheonix.config' + } + shh { + includeConfig 'conf/base.config' + includeConfig 'conf/shh.config' + } cfc { includeConfig 'conf/base.config' includeConfig 'conf/cfc.config' @@ -100,7 +107,7 @@ manifest { name = 'nf-core/eager' author = 'Alexander Peltzer, Stephen Clayton, James A Fellows-Yates' homePage = 'https://github.com/nf-core/eager' - version = '2.0.2' + version = '2.0.3dev' description = 'A fully reproducible and modern ancient DNA pipeline in Nextflow and with cloud support.' mainScript = 'main.nf' nextflowVersion = '>=0.32.0'
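The newly registered `phoenix` and `shh` profiles layer a site-specific config on top of the shared `base.config`; selecting one is a single flag (a sketch; read and reference paths are placeholders):

```bash
# Run on the SHH SLURM cluster via the bundled site profile
nextflow run nf-core/eager -profile shh --pairedEnd \
    --reads '/data/*_R{1,2}.fastq.gz' --fasta '/data/reference.fasta'
```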
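For reference, the rewritten `samtools_filter` process above splits mapped and unmapped reads in a single pass over the input using `tee` plus bash process substitution. A standalone sketch of the same idea, not the pipeline's verbatim command (file names and the MAPQ cut-off are illustrative; assumes a coordinate-sorted input BAM):

```bash
#!/usr/bin/env bash
# One pass over the input: the main pipeline keeps mapped reads passing the
# quality filter, while the process-substitution branch collects unmapped
# reads (SAM flag 4 set) as gzipped FastQ.
samtools view -h sample.sorted.bam \
    | tee >(samtools fastq -t -n -f 4 - | gzip > sample.unmapped.fastq.gz) \
    | samtools view -b -F 4 -q 30 -o sample.filtered.bam -
# The filtered BAM keeps the input's coordinate order, so it can be indexed directly
samtools index sample.filtered.bam
```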