Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

speed improvements #31

Merged
merged 4 commits into from
May 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ jobs:
environment.yml
- name: Build new docker image
if: env.MATCHED_FILES
run: docker build --no-cache . -t qbicpipelines/bamtofastq:1.1.0
run: docker build --no-cache . -t qbicpipelines/bamtofastq:dev

- name: Pull docker image
if: ${{ !env.MATCHED_FILES }}
run: |
docker pull qbicpipelines/bamtofastq:dev
docker tag qbicpipelines/bamtofastq:dev qbicpipelines/bamtofastq:1.1.0
docker tag qbicpipelines/bamtofastq:dev qbicpipelines/bamtofastq:dev
- name: Install Nextflow
run: |
wget -qO- get.nextflow.io | bash
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# nf-core/bamtofastq: Changelog

## v1.1.1 - Katherine Johnson faster

- [#31](https://github.com/qbic-pipelines/bamtofastq/pull/31) Add option `--samtools_collate_fast` and speed up the `cat` step that joins mapped and unmapped reads.

## v1.1.0 - Katherine Johnson

- [#21](https://github.com/qbic-pipelines/bamtofastq/pull/21) Allows bam indices as additional input files
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ LABEL authors="Friederike Hanssen" \

COPY environment.yml /
RUN conda env create -f /environment.yml && conda clean -a
ENV PATH /opt/conda/envs/qbic-pipelines-bamtofastq-1.1.0/bin:$PATH
ENV PATH /opt/conda/envs/qbic-pipelines-bamtofastq-1.1.1dev/bin:$PATH
2 changes: 1 addition & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ process {
}
withLabel:process_high {
cpus = { check_max( 15 * task.attempt, 'cpus' ) }
memory = { check_max( 120.GB * task.attempt, 'memory' ) }
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
time = { check_max( 10.h * task.attempt, 'time' ) }
}
withLabel:process_long {
Expand Down
4 changes: 4 additions & 0 deletions conf/test_bai.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ params {
max_cpus = 2
max_memory = 6.GB
max_time = 48.h
samtools_collate_fast = true
no_stats = true
no_read_QC = true


index_files = true
input_paths = [
Expand Down
5 changes: 5 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* [`--index_files`](#--index_files)
* [`--chr`](#--chr)
* [`--no_read_QC`](#--no_read_QC)
* [`--samtools_collate_fast`](#--samtools_collate_fast)
* [`--no_stats`](#--no_stats)
* [Job resources](#job-resources)
* [Automatic resubmission](#automatic-resubmission)
Expand Down Expand Up @@ -154,6 +155,10 @@ Use to skip `FastQC` on obtained reads. This is useful, when the reads are used
--no_read_QC
```

### `--samtools_collate_fast` (optional)

Use to enable the fast mode of the `samtools collate` command (adds `-f -r 100000`) in the processes `sortExtractUnmapped` and `sortExtractSingleEnd`.

### `--no_stats` (optional)

Use to skip `FastQC` on both input bam and output reads, as well as all `samtools flagstat`, `samtools idxstats`, and `samtools stats`. This is useful for large datasets, since the quality metrics processes require a significant amount of time and resources.
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# You can use this file to create a conda environment for this pipeline:
# conda env create -f environment.yml
name: qbic-pipelines-bamtofastq-1.1.0
name: qbic-pipelines-bamtofastq-1.1.1dev
channels:
- conda-forge
- bioconda
Expand Down
26 changes: 15 additions & 11 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,9 @@ process checkIfPairedEnd{

output:
set val(name), file(bam), file(bai), file('*paired.txt') optional true into bam_files_paired_map_map,
bam_files_paired_unmap_unmap,
bam_files_paired_unmap_map,
bam_files_paired_map_unmap
bam_files_paired_unmap_unmap,
bam_files_paired_unmap_map,
bam_files_paired_map_unmap
set val(name), file(bam), file(bai), file('*single.txt') optional true into bam_file_single_end // = is not paired end

//Take samtools header + the first 1000 reads (to save time; otherwise all reads can also be used) and check whether for
Expand Down Expand Up @@ -443,8 +443,8 @@ process pairedEndMapUnmap{
}

unmap_unmap_bam.join(map_unmap_bam, remainder: true)
.join(unmap_map_bam, remainder: true)
.set{ all_unmapped_bam }
.join(unmap_map_bam, remainder: true)
.set{ all_unmapped_bam }

process mergeUnmapped{
tag "$name"
Expand Down Expand Up @@ -489,9 +489,10 @@ process sortExtractUnmapped{
set val(name), file('*_unmapped.fq.gz') into reads_unmapped

script:
def collate_fast = params.samtools_collate_fast ? "-f -r 100000" : ""
"""
samtools collate -O -@$task.cpus $all_unmapped . \
| samtools fastq -1 ${name}_R1_unmapped.fq.gz -2 ${name}_R2_unmapped.fq.gz -s ${name}_unmapped_singletons.fq.gz -N -@$task.cpus
samtools collate -O -@$task.cpus $collate_fast $all_unmapped . \
| samtools fastq -1 ${name}_R1_unmapped.fq.gz -2 ${name}_R2_unmapped.fq.gz -s ${name}_unmapped_singletons.fq.gz -N -@$task.cpus
"""
}

Expand Down Expand Up @@ -519,8 +520,10 @@ process joinMappedAndUnmappedFastq{

script:
"""
cat $mapped_fq1 $unmapped_fq1 > ${name}.1.fq.gz
cat $mapped_fq2 $unmapped_fq2 > ${name}.2.fq.gz
# Append the unmapped reads to the mapped reads in place instead of writing a
# fresh concatenated copy, then rename the result to the final output name.
cat $unmapped_fq1 >> $mapped_fq1
mv $mapped_fq1 ${name}.1.fq.gz
# Must append with '>>' exactly as for R1: a plain '>' would truncate the
# mapped R2 reads and leave only the unmapped mates in the output.
cat $unmapped_fq2 >> $mapped_fq2
mv $mapped_fq2 ${name}.2.fq.gz
"""
}

Expand Down Expand Up @@ -568,9 +571,10 @@ process sortExtractSingleEnd{
txt.exists()

script:
def collate_fast = params.samtools_collate_fast ? "-f -r 100000" : ""
"""
samtools collate -O -@$task.cpus $bam . \
| samtools fastq -0 ${name}.singleton.fq.gz -N -@$task.cpus
samtools collate -O -@$task.cpus $collate_fast $bam . \
| samtools fastq -0 ${name}.singleton.fq.gz -N -@$task.cpus
"""
}

Expand Down
5 changes: 3 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ params {
index_files = false
no_stats = false
no_read_QC = false //By default: QC is performed on extracted reads
samtools_collate_fast = false
outdir = './results'

// Boilerplate options
Expand All @@ -38,7 +39,7 @@ params {

// Container slug. Stable releases should specify release tag!
// Developmental code should specify :dev
process.container = 'qbicpipelines/bamtofastq:1.1.0'
process.container = 'qbicpipelines/bamtofastq:dev'

// Load base.config by default for all pipelines
includeConfig 'conf/base.config'
Expand Down Expand Up @@ -93,7 +94,7 @@ manifest {
description = 'Workflow converts one or multiple bam files back to the fastq format'
mainScript = 'main.nf'
nextflowVersion = '>=20.04.1'
version = '1.1.0'
version = '1.1.1dev'
}

// Function to ensure that resource requirements don't go beyond
Expand Down