From e1a3bafaafa5eac730b45134f43650311cf11389 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Wed, 5 May 2021 16:02:52 +0200 Subject: [PATCH 1/4] bump-versions-dev --- .github/workflows/ci.yml | 4 ++-- Dockerfile | 2 +- environment.yml | 2 +- nextflow.config | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4189dd14..1cb73efa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,13 +33,13 @@ jobs: environment.yml - name: Build new docker image if: env.MATCHED_FILES - run: docker build --no-cache . -t qbicpipelines/bamtofastq:1.1.0 + run: docker build --no-cache . -t qbicpipelines/bamtofastq:dev - name: Pull docker image if: ${{ !env.MATCHED_FILES }} run: | docker pull qbicpipelines/bamtofastq:dev - docker tag qbicpipelines/bamtofastq:dev qbicpipelines/bamtofastq:1.1.0 + docker tag qbicpipelines/bamtofastq:dev qbicpipelines/bamtofastq:dev - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash diff --git a/Dockerfile b/Dockerfile index 2cbcbe3b..3e2c1aca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,4 +4,4 @@ LABEL authors="Friederike Hanssen" \ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/qbic-pipelines-bamtofastq-1.1.0/bin:$PATH +ENV PATH /opt/conda/envs/qbic-pipelines-bamtofastq-1.1.1dev/bin:$PATH diff --git a/environment.yml b/environment.yml index 9d9a5178..2252a3de 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: qbic-pipelines-bamtofastq-1.1.0 +name: qbic-pipelines-bamtofastq-1.1.1dev channels: - conda-forge - bioconda diff --git a/nextflow.config b/nextflow.config index aea5f668..670c7530 100644 --- a/nextflow.config +++ b/nextflow.config @@ -38,7 +38,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'qbicpipelines/bamtofastq:1.1.0' +process.container = 'qbicpipelines/bamtofastq:dev' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -93,7 +93,7 @@ manifest { description = 'Workflow converts one or multiple bam files back to the fastq format' mainScript = 'main.nf' nextflowVersion = '>=20.04.1' - version = '1.1.0' + version = '1.1.1dev' } // Function to ensure that resource requirements don't go beyond From f6c2ef964801cd3bc843beee4c9118a7d1080acf Mon Sep 17 00:00:00 2001 From: ggabernet Date: Wed, 5 May 2021 16:21:38 +0200 Subject: [PATCH 2/4] samtools collate fast and cat improvements --- conf/base.config | 2 +- docs/usage.md | 5 +++++ main.nf | 26 +++++++++++++++----------- nextflow.config | 1 + 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/conf/base.config b/conf/base.config index d796f6f8..eb70fb46 100644 --- a/conf/base.config +++ b/conf/base.config @@ -36,7 +36,7 @@ process { } withLabel:process_high { cpus = { check_max( 15 * task.attempt, 'cpus' ) } - memory = { check_max( 120.GB * task.attempt, 'memory' ) } + memory = { check_max( 200.GB * task.attempt, 'memory' ) } time = { check_max( 10.h * task.attempt, 'time' ) } } withLabel:process_long { diff --git a/docs/usage.md b/docs/usage.md index 55ab6dd5..b519d4c6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -15,6 +15,7 @@ * [`--index_files`](#--index_files) * [`--chr`](#--chr) * [`--no_read_QC`](#--no_read_QC) + * [`--samtools_collate_fast`](#--samtools_collate_fast) * [`--no_stats`](#--no_stats) * [Job resources](#job-resources) * [Automatic resubmission](#automatic-resubmission) @@ -154,6 +155,10 @@ Use to skip `FastQC` on obtained reads. This is useful, when the reads are used --no_read_QC ``` +### `--samtools_collate_fast` (optional) + +Use to enable the fast mode of the `samtools collate` command (run with `-f -r 100000`) in the processes `sortExtractUnmapped` and `sortExtractSingleEnd`.
+ ### `--no_stats` (optional) Use to skip `FastQC` on both input bam and output reads, as well as all `samtools flagstat`, `samtools idxstats`, and `samtools stats`. This is useful for large datasets, since the quality metrics processes require a significant amount of time and resources. diff --git a/main.nf b/main.nf index 964abb5b..c549853f 100644 --- a/main.nf +++ b/main.nf @@ -349,9 +349,9 @@ process checkIfPairedEnd{ output: set val(name), file(bam), file(bai), file('*paired.txt') optional true into bam_files_paired_map_map, - bam_files_paired_unmap_unmap, - bam_files_paired_unmap_map, - bam_files_paired_map_unmap + bam_files_paired_unmap_unmap, + bam_files_paired_unmap_map, + bam_files_paired_map_unmap set val(name), file(bam), file(bai), file('*single.txt') optional true into bam_file_single_end // = is not paired end //Take samtools header + the first 1000 reads (to safe time, otherwise also all can be used) and check whether for @@ -443,8 +443,8 @@ process pairedEndMapUnmap{ } unmap_unmap_bam.join(map_unmap_bam, remainder: true) - .join(unmap_map_bam, remainder: true) - .set{ all_unmapped_bam } + .join(unmap_map_bam, remainder: true) + .set{ all_unmapped_bam } process mergeUnmapped{ tag "$name" @@ -489,9 +489,10 @@ process sortExtractUnmapped{ set val(name), file('*_unmapped.fq.gz') into reads_unmapped script: + def collate_fast = params.samtools_collate_fast ? "-f -r 100000" : "" """ - samtools collate -O -@$task.cpus $all_unmapped . \ - | samtools fastq -1 ${name}_R1_unmapped.fq.gz -2 ${name}_R2_unmapped.fq.gz -s ${name}_unmapped_singletons.fq.gz -N -@$task.cpus + samtools collate -O -@$task.cpus $collate_fast $all_unmapped . 
\ + | samtools fastq -1 ${name}_R1_unmapped.fq.gz -2 ${name}_R2_unmapped.fq.gz -s ${name}_unmapped_singletons.fq.gz -N -@$task.cpus """ } @@ -519,8 +520,10 @@ process joinMappedAndUnmappedFastq{ script: """ - cat $mapped_fq1 $unmapped_fq1 > ${name}.1.fq.gz - cat $mapped_fq2 $unmapped_fq2 > ${name}.2.fq.gz + cat $unmapped_fq1 >> $mapped_fq1 + mv $mapped_fq1 ${name}.1.fq.gz + cat $unmapped_fq2 >> $mapped_fq2 + mv $mapped_fq2 ${name}.2.fq.gz """ } @@ -568,9 +571,10 @@ process sortExtractSingleEnd{ txt.exists() script: + def collate_fast = params.samtools_collate_fast ? "-f -r 100000" : "" """ - samtools collate -O -@$task.cpus $bam . \ - | samtools fastq -0 ${name}.singleton.fq.gz -N -@$task.cpus + samtools collate -O -@$task.cpus $collate_fast $bam . \ + | samtools fastq -0 ${name}.singleton.fq.gz -N -@$task.cpus """ } diff --git a/nextflow.config b/nextflow.config index 670c7530..5a504c14 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,7 @@ params { index_files = false no_stats = false no_read_QC = false //By default: QC is perfored on extrcted reads + samtools_collate_fast = false outdir = './results' // Boilerplate options From da3604e36ce12f471d5df136626f11337be05750 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Wed, 5 May 2021 16:25:47 +0200 Subject: [PATCH 3/4] update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83357d7e..5883396b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # nf-core/bamtofastq: Changelog +## v1.1.1 - Katherine Johnson faster + +- [#31](https://github.com/qbic-pipelines/bamtofastq/pull/31) Add option `--samtools_collate_fast` and improve speed of cat.
+ ## v1.1.0 - Katherine Johnson - [#21](https://github.com/qbic-pipelines/bamtofastq/21) Allows bam indices as additional input files From ca13dd2beb851d27e221635d76e219c1cc02f4d9 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Wed, 5 May 2021 16:27:44 +0200 Subject: [PATCH 4/4] add new params to test --- conf/test_bai.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conf/test_bai.config b/conf/test_bai.config index 155ada5c..bc6d5b1f 100644 --- a/conf/test_bai.config +++ b/conf/test_bai.config @@ -15,6 +15,10 @@ params { max_cpus = 2 max_memory = 6.GB max_time = 48.h + samtools_collate_fast = true + no_stats = true + no_read_QC = true + index_files = true input_paths = [