From e1a3bafaafa5eac730b45134f43650311cf11389 Mon Sep 17 00:00:00 2001
From: ggabernet <gisela.gabernet@qbic.uni-tuebingen.de>
Date: Wed, 5 May 2021 16:02:52 +0200
Subject: [PATCH 1/4] bump-versions-dev

---
 .github/workflows/ci.yml | 4 ++--
 Dockerfile               | 2 +-
 environment.yml          | 2 +-
 nextflow.config          | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4189dd14..1cb73efa 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,13 +33,13 @@ jobs:
             environment.yml
       - name: Build new docker image
         if: env.MATCHED_FILES
-        run: docker build --no-cache . -t qbicpipelines/bamtofastq:1.1.0
+        run: docker build --no-cache . -t qbicpipelines/bamtofastq:dev
 
       - name: Pull docker image
         if: ${{ !env.MATCHED_FILES }}
         run: |
           docker pull qbicpipelines/bamtofastq:dev
-          docker tag qbicpipelines/bamtofastq:dev qbicpipelines/bamtofastq:1.1.0
+          docker tag qbicpipelines/bamtofastq:dev qbicpipelines/bamtofastq:dev
       - name: Install Nextflow
         run: |
           wget -qO- get.nextflow.io | bash
diff --git a/Dockerfile b/Dockerfile
index 2cbcbe3b..3e2c1aca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,4 +4,4 @@ LABEL authors="Friederike Hanssen" \
 
 COPY environment.yml /
 RUN conda env create -f /environment.yml && conda clean -a
-ENV PATH /opt/conda/envs/qbic-pipelines-bamtofastq-1.1.0/bin:$PATH
+ENV PATH /opt/conda/envs/qbic-pipelines-bamtofastq-1.1.1dev/bin:$PATH
diff --git a/environment.yml b/environment.yml
index 9d9a5178..2252a3de 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 #   conda env create -f environment.yml
-name: qbic-pipelines-bamtofastq-1.1.0
+name: qbic-pipelines-bamtofastq-1.1.1dev
 channels:
   - conda-forge
   - bioconda
diff --git a/nextflow.config b/nextflow.config
index aea5f668..670c7530 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -38,7 +38,7 @@ params {
 
 // Container slug. Stable releases should specify release tag!
 // Developmental code should specify :dev
-process.container = 'qbicpipelines/bamtofastq:1.1.0'
+process.container = 'qbicpipelines/bamtofastq:dev'
 
 // Load base.config by default for all pipelines
 includeConfig 'conf/base.config'
@@ -93,7 +93,7 @@ manifest {
   description = 'Workflow converts one or multiple bam files back to the fastq format'
   mainScript = 'main.nf'
   nextflowVersion = '>=20.04.1'
-  version = '1.1.0'
+  version = '1.1.1dev'
 }
 
 // Function to ensure that resource requirements don't go beyond

From f6c2ef964801cd3bc843beee4c9118a7d1080acf Mon Sep 17 00:00:00 2001
From: ggabernet <gisela.gabernet@qbic.uni-tuebingen.de>
Date: Wed, 5 May 2021 16:21:38 +0200
Subject: [PATCH 2/4] samtools collate fast and cat improvements

---
 conf/base.config |  2 +-
 docs/usage.md    |  5 +++++
 main.nf          | 26 +++++++++++++++-----------
 nextflow.config  |  1 +
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index d796f6f8..eb70fb46 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -36,7 +36,7 @@ process {
   }
   withLabel:process_high {
     cpus = { check_max( 15 * task.attempt, 'cpus' ) }
-    memory = { check_max( 120.GB * task.attempt, 'memory' ) }
+    memory = { check_max( 200.GB * task.attempt, 'memory' ) }
     time = { check_max( 10.h * task.attempt, 'time' ) }
   }
   withLabel:process_long {
diff --git a/docs/usage.md b/docs/usage.md
index 55ab6dd5..b519d4c6 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -15,6 +15,7 @@
   * [`--index_files`](#--index_files)
   * [`--chr`](#--chr)
   * [`--no_read_QC`](#--no_read_QC)
+  * [`--samtools_collate_fast`](#--samtools_collate_fast)
   * [`--no_stats`](#--no_stats)
 * [Job resources](#job-resources)
   * [Automatic resubmission](#automatic-resubmission)
@@ -154,6 +155,10 @@ Use to skip `FastQC` on obtained reads. This is useful, when the reads are used
 --no_read_QC
 ```
 
+### `--samtools_collate_fast` (optional)
+
+Use to enable the fast mode of `samtools collate` (`-f -r 100000`, which outputs primary alignments only) in the processes `sortExtractUnmapped` and `sortExtractSingleEnd`.
+
 ### `--no_stats` (optional)
 
 Use to skip `FastQC` on both input bam and output reads, as well as all `samtools flagstat`, `samtools idxstats`, and `samtools stats`. This is useful for large datasets, since the quality metrics processes require a significant amount of time and resources.
diff --git a/main.nf b/main.nf
index 964abb5b..c549853f 100644
--- a/main.nf
+++ b/main.nf
@@ -349,9 +349,9 @@ process checkIfPairedEnd{
 
   output:
   set val(name), file(bam), file(bai), file('*paired.txt') optional true into bam_files_paired_map_map,
-                                                                   bam_files_paired_unmap_unmap,
-                                                                   bam_files_paired_unmap_map,
-                                                                   bam_files_paired_map_unmap
+                                                                              bam_files_paired_unmap_unmap,
+                                                                              bam_files_paired_unmap_map,
+                                                                              bam_files_paired_map_unmap
   set val(name), file(bam), file(bai), file('*single.txt') optional true into bam_file_single_end // = is not paired end
 
   //Take samtools header + the first 1000 reads (to safe time, otherwise also all can be used) and check whether for
@@ -443,8 +443,8 @@ process pairedEndMapUnmap{
 }
 
 unmap_unmap_bam.join(map_unmap_bam, remainder: true)
-               .join(unmap_map_bam, remainder: true)
-               .set{ all_unmapped_bam }
+                .join(unmap_map_bam, remainder: true)
+                .set{ all_unmapped_bam }
 
 process mergeUnmapped{
   tag "$name"
@@ -489,9 +489,10 @@ process sortExtractUnmapped{
   set val(name), file('*_unmapped.fq.gz') into reads_unmapped
 
   script:
+  def collate_fast = params.samtools_collate_fast ? "-f -r 100000" : ""
   """
-  samtools collate -O -@$task.cpus $all_unmapped . \
-     | samtools fastq -1 ${name}_R1_unmapped.fq.gz -2 ${name}_R2_unmapped.fq.gz -s ${name}_unmapped_singletons.fq.gz -N -@$task.cpus
+  samtools collate -O -@$task.cpus $collate_fast $all_unmapped . \
+      | samtools fastq -1 ${name}_R1_unmapped.fq.gz -2 ${name}_R2_unmapped.fq.gz -s ${name}_unmapped_singletons.fq.gz -N -@$task.cpus
   """
 }
 
@@ -519,8 +520,10 @@ process joinMappedAndUnmappedFastq{
 
   script:
   """
-  cat $mapped_fq1 $unmapped_fq1 > ${name}.1.fq.gz
-  cat $mapped_fq2 $unmapped_fq2 > ${name}.2.fq.gz
+  cat $unmapped_fq1 >> $mapped_fq1
+  mv $mapped_fq1 ${name}.1.fq.gz
+  cat $unmapped_fq2 >> $mapped_fq2
+  mv $mapped_fq2 ${name}.2.fq.gz
   """
 }
 
@@ -568,9 +571,10 @@ process sortExtractSingleEnd{
     txt.exists()
 
     script:
+    def collate_fast = params.samtools_collate_fast ? "-f -r 100000" : ""
     """
-    samtools collate -O -@$task.cpus $bam . \
-     | samtools fastq -0 ${name}.singleton.fq.gz -N -@$task.cpus
+    samtools collate -O -@$task.cpus $collate_fast $bam . \
+      | samtools fastq -0 ${name}.singleton.fq.gz -N -@$task.cpus
     """
  }
 
diff --git a/nextflow.config b/nextflow.config
index 670c7530..5a504c14 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -14,6 +14,7 @@ params {
   index_files = false
   no_stats = false
   no_read_QC = false //By default: QC is perfored on extrcted reads
+  samtools_collate_fast = false
   outdir = './results'
 
   // Boilerplate options

From da3604e36ce12f471d5df136626f11337be05750 Mon Sep 17 00:00:00 2001
From: ggabernet <gisela.gabernet@qbic.uni-tuebingen.de>
Date: Wed, 5 May 2021 16:25:47 +0200
Subject: [PATCH 3/4] update changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 83357d7e..5883396b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # nf-core/bamtofastq: Changelog
 
+## v1.1.1 - Katherine Johnson faster
+
+- [#31](https://github.com/qbic-pipelines/bamtofastq/pull/31) Add option `--samtools_collate_fast` and improve speed of cat.
+
 ## v1.1.0 -  Katherine Johnson
 
 - [#21](https://github.com/qbic-pipelines/bamtofastq/21) Allows bam indices as additional input files

From ca13dd2beb851d27e221635d76e219c1cc02f4d9 Mon Sep 17 00:00:00 2001
From: ggabernet <gisela.gabernet@qbic.uni-tuebingen.de>
Date: Wed, 5 May 2021 16:27:44 +0200
Subject: [PATCH 4/4] add new params to test

---
 conf/test_bai.config | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/conf/test_bai.config b/conf/test_bai.config
index 155ada5c..bc6d5b1f 100644
--- a/conf/test_bai.config
+++ b/conf/test_bai.config
@@ -15,6 +15,10 @@ params {
   max_cpus = 2
   max_memory = 6.GB
   max_time = 48.h
+  samtools_collate_fast = true
+  no_stats = true
+  no_read_QC = true
+
 
   index_files = true
   input_paths = [