Merge pull request #111 from apeltzer/zip_fasta

Enable gzipped FastA input as reference genome
nf-core · Dec 17, 2018 · 2a7e70e · 2a7e70e
2 parents 1f38063 + 16d4412
commit 2a7e70e
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 4 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -48,5 +48,7 @@ script:
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --circularmapper --circulartarget 'NC_007596.2'
   # Test running with BWA Mem
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --bwamem --bwa_index results/reference_genome/bwa_index/
+  # Test with zipped reference input
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --fasta 'https://raw.githubusercontent.com/nf-core/test-datasets/eager2/reference/Test.fasta.gz'
   # Test basic pipeline with Conda too 
   - travis_wait 25 nextflow run ${TRAVIS_BUILD_DIR} -profile test,conda --pairedEnd --bwa_index results/reference_genome/bwa_index/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,9 +6,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [Unpublished / Dev Branch]
 
-### `Added` 
+### `Added`
+* [#111](https://github.com/nf-core/eager/pull/111) - Allow [Zipped FastA reference input](https://github.com/nf-core/eager/issues/91)
 * [#113](https://github.com/nf-core/eager/pull/113) - All files are now staged via channels, which is considered best practice by Nextflow. 
 
+
 ### `Fixed`
 * [#110](https://github.com/nf-core/eager/pull/110) - Fix for [MultiQC Missing Second FastQC report](https://github.com/nf-core/eager/issues/107)
 

diff --git a/docs/configuration/reference_genomes.md b/docs/configuration/reference_genomes.md
@@ -10,7 +10,6 @@ Read [Adding your own system](adding_your_own.md) to find out how to set up cust
 ## Adding paths to a config file
 Specifying long paths every time you run the pipeline is a pain.
 To make this easier, the pipeline comes configured to understand reference genome keywords which correspond to preconfigured paths, meaning that you can just specify `--genome ID` when running the pipeline.
->>>>>>> TEMPLATE
 
 Note that this genome key can also be specified in a config file if you always use the same genome.
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -136,7 +136,7 @@ If you prefer, you can specify the full path to your reference genome when you r
 ```bash
 --fasta '[path to Fasta reference]'
 ```
-> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note, that saving these for later has to be turned on using `--saveReference`.
+> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that saving these for later has to be turned on using `--saveReference`. You may also specify the path to a gzipped (`*.gz` file extension) FastA as the reference genome - the pipeline will uncompress it for you automatically.
 
 ### `--genome` (using iGenomes)
 

diff --git a/main.nf b/main.nf
@@ -217,9 +217,37 @@ Channel.fromPath("$baseDir/assets/where_are_my_files.txt")
        .into{ ch_where_for_bwa_index; ch_where_for_fasta_index; ch_where_for_seqdict}
 
 // Validate inputs
-Channel.fromPath("${params.fasta}")
+if("${params.fasta}".endsWith(".gz")){
+    // Gzipped reference: decompress it ONCE here and fan the plain FastA out to every consumer. Do not unzip inside each downstream process - Nextflow only links the file from work dir to work dir, so repeating the decompression would be wasted effort.
+    Channel.fromPath("${params.fasta}")
+            .ifEmpty { exit 1, "No genome specified! Please specify one with --fasta"}
+            .set {ch_unzip_fasta}
+    // Decompresses the gzipped reference with pigz and emits the resulting FastA into all ch_fasta_for_* channels.
+    process unzip_reference{
+        tag "$zipfasta"
+
+        input:
+        file zipfasta from ch_unzip_fasta
+        // Output is the input name minus ".gz" (what pigz -d produces), so .fa.gz / .fna.gz references are captured too - a fixed "*.fasta" glob would miss them.
+        output:
+        file "${zipfasta.baseName}" into (ch_fasta_for_bwa_indexing, ch_fasta_for_faidx_indexing, ch_fasta_for_dict_indexing,  ch_fasta_for_bwa_mapping, ch_fasta_for_damageprofiler, ch_fasta_for_qualimap, ch_fasta_for_pmdtools, ch_fasta_for_circularmapper, ch_fasta_for_circularmapper_index,ch_fasta_for_bwamem_mapping)
+
+        script:
+        """
+        pigz -f -d -p ${task.cpus} $zipfasta
+        """
+    }
+} else {
+    Channel.fromPath("${params.fasta}")
     .ifEmpty { exit 1, "No genome specified! Please specify one with --fasta"}
     .into {ch_fasta_for_bwa_indexing;ch_fasta_for_faidx_indexing;ch_fasta_for_dict_indexing; ch_fasta_for_bwa_mapping; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_circularmapper; ch_fasta_for_circularmapper_index;ch_fasta_for_bwamem_mapping}
+}
+
+
+
+
+
+
 
 //Index files provided? Then check whether they are correct and complete
 if (params.aligner != 'bwa' && !params.circularmapper && !params.bwamem){