Merge pull request #87 from Eco-Flow/update_readme
Updated README
FernandoDuarteF authored Nov 13, 2024
2 parents a26730a + 9be9c72 commit 64a7fdf
Showing 4 changed files with 67 additions and 79 deletions.
28 changes: 13 additions & 15 deletions README.md
@@ -50,7 +50,7 @@ For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#intr

In addition to the three different modes described above, it is also possible to run the pipeline with or without sequencing reads. When supplying sequencing reads, Merqury can also be run. [Merqury](https://github.com/marbl/merqury) is a tool for genome quality assessment that uses k-mer counts from raw sequencing data to evaluate the accuracy and completeness of a genome assembly. Meryl is the companion tool that efficiently counts and stores k-mers from sequencing reads, enabling Merqury to estimate metrics like assembly completeness and base accuracy. These tools provide a k-mer-based approach to assess assembly quality, helping to identify potential errors or gaps.
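
For orientation, a standalone Merqury run (outside this pipeline) follows roughly the pattern below. This is only a sketch: the k-mer size and file names are placeholders, exact `meryl` syntax may differ between versions, and within the pipeline these tools are driven by the Meryl and Merqury modules instead.

```
# Sketch of a manual Merqury run (placeholder names; k=21 chosen arbitrarily)
meryl k=21 count output reads.meryl reads.fq.gz    # count k-mers in the raw reads
merqury.sh reads.meryl assembly.fasta merqury_out  # compare assembly k-mers against the read set
```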

To run the pipeline with reads, you must supply a single FASTQ file for each genome in the samplesheet. It is assumed that reads used to create the assembly are from long-read technology such as PacBio or ONT, and are therefore single-end. If reads are in a .bam file, they must be converted to FASTQ format first. If you have paired-end reads, these must be interleaved first.
To run the pipeline with reads, you must supply a single FASTQ file for each genome in the samplesheet, alongside the `--run_merqury` flag. It is assumed that reads used to create the assembly are from long-read technology such as PacBio or ONT, and are therefore single-end. If reads are in a .bam file, they must be converted to FASTQ format first. If you have paired-end reads, these must be interleaved first.
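
As a rough illustration of those preprocessing steps (not part of the pipeline; it assumes `samtools` and `seqtk` are available and uses placeholder file names):

```
# Convert long reads from BAM to FASTQ
samtools fastq reads.bam | gzip > reads.fq.gz

# Interleave paired-end reads into a single FASTQ
seqtk mergepe reads_R1.fq.gz reads_R2.fq.gz | gzip > reads_interleaved.fq.gz
```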

## Usage

@@ -61,36 +61,34 @@ First, prepare a `samplesheet.csv`, where your input data points to genomes + or annotation files:

```csv
species,refseq,fasta,gff,fastq
Homo_sapiens,,/path/to/genome.fasta,/path/to/annotation.gff3,/path/to/reads.fq.gz
Gorilla_gorilla,,/path/to/genome.fasta,,/path/to/reads.fq.gz
Pan_paniscus,,/path/to/genome.fasta,/path/to/annotation.gff3,/path/to/reads.fq.gz
Homo_sapiens,,/path/to/genome.fasta,/path/to/annotation.gff3,[/path/to/reads.fq.gz]
Gorilla_gorilla,,/path/to/genome.fasta,/path/to/annotation.gff3,[/path/to/reads.fq.gz]
Pan_paniscus,,/path/to/genome.fasta,/path/to/annotation.gff3,[/path/to/reads.fq.gz]
```

Or to Refseq IDs of your species:
When running in `--genome_only` mode, you can leave the **gff** field empty; if a GFF is supplied, it will be ignored.
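
For example, a genome-only run might look like this (a sketch; the samplesheet path is a placeholder):

```
nextflow run main.nf -resume -profile docker --input samplesheet.csv --genome_only --outdir results
```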

Additionally, you can run the pipeline using the RefSeq IDs of your species:

```csv
species,refseq,fasta,gff,fastq
Pongo_abelii,GCF_028885655.2,,,/path/to/reads.fq.gz
Macaca_mulatta,GCF_003339765.1,,,/path/to/reads.fq.gz
Pongo_abelii,GCF_028885655.2,,,[/path/to/reads.fq.gz]
Macaca_mulatta,GCF_003339765.1,,,[/path/to/reads.fq.gz]
```

The **fastq** field is optional. Supply sequencing reads if you intend to run Merqury using the `--run_merqury` flag. Otherwise, this field will be ignored.

You can mix the two input types **(in development)**.

Each row represents a species, with its associated genome and gff, or a RefSeq ID (to auto-download the genome + gff).

You can run the pipeline using test profiles or example input samplesheets. To run a test set with a samplesheet containing reads:

```
nextflow run main.nf -resume -profile docker,test --outdir results
```

If you supply sequencing reads in your samplesheet, you can still disable merqury by using the parameter `--merqury_skip true`. Alternatively, use a different test profile that does _not_ contain sequencing reads:

```
nextflow run main.nf -resume -profile docker,test_nofastq --outdir results
nextflow run main.nf -resume -profile docker,test --outdir results --run_merqury
```

To run this pipeline on an example samplesheet included in the repo assets:
To run this pipeline on an example samplesheet included in the repo assets (_does not include reads_):

```
nextflow run main.nf -resume -profile docker --input assets/samplesheet.csv --outdir results
16 changes: 7 additions & 9 deletions subworkflows/local/genome_and_annotation.nf
@@ -5,7 +5,6 @@ include { BUSCO_BUSCO } from '../../modules/nf-core/busc
include { QUAST } from '../../modules/nf-core/quast/main'
include { AGAT_SPSTATISTICS } from '../../modules/nf-core/agat/spstatistics/main'
include { PLOT_BUSCO_IDEOGRAM } from '../../modules/local/plot_busco_ideogram.nf'
//include { GFFREAD } from '../../modules/nf-core/gffread/main'
include { GFFREAD } from '../../modules/local/gffread'
include { ORTHOFINDER } from '../../modules/nf-core/orthofinder/main'

@@ -35,14 +34,13 @@ workflow GENOME_AND_ANNOTATION {

// Combine inputs (fasta and gff from AGAT) into a single multichannel so that they
// keep in sync
ch_fasta
| combine(ch_gff_agat, by:0) // by:0 | Only combine when both channels share the same id
| multiMap {
meta, fasta, gff ->
fasta : fasta ? tuple( meta, file(fasta) ) : null
gff : gff ? tuple( meta, file(gff) ) : null
}
| set { ch_input }
ch_input = ch_fasta
| combine(ch_gff_agat, by:0) // by:0 | Only combine when both channels share the same id
| multiMap {
meta, fasta, gff ->
fasta : fasta ? tuple( meta, file(fasta) ) : null // channel: [ val(meta), [ fasta ] ]
gff : gff ? tuple( meta, file(gff) ) : null // channel: [ val(meta), [ gff ] ]
}
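// multiMap exposes the named outputs as ch_input.fasta and ch_input.gff, so downstream steps can consume them independently while staying keyed by meta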

//
// Run AGAT Spstatistics
10 changes: 6 additions & 4 deletions subworkflows/local/utils_nfcore_genomeqc_pipeline/main.nf
@@ -149,9 +149,11 @@ def validateInputParameters() {
def validateInputSamplesheet(input) {
def (meta, refseq, fasta, gff, fastq) = input
if (params.run_merqury && !fastq) { // Perhaps this should be on validateInputParameters()
error("You are runnning using --run_merqury flag but no fastq was found")
error("You are runnning genomeqc using --run_merqury, but no fastq file was found")
}
// As for now, there are only two input options: RefSeq ID or local files. The pipeline will throw an error if the sample sheet does not contain the proper information
// As for now, there are only two input options: RefSeq ID or local files.
// The pipeline will throw an error if the sample sheet does not contain
// the proper information
// If --genome_only parameter
// Check for genome-only mode
if (params.genome_only) {
@@ -160,15 +162,15 @@
} else if (meta && !refseq && fasta) {
return [meta, fasta, gff, fastq] // Empty or not gff, either way won't be used
} else {
error("You are running in --genome_only mode. Please check input samplesheet -> Incorrect samplesheet format")
error("You are running genomeqc in --genome_only mode, but no fasta file or RefSeq ID was found. Please check input samplesheet -> Incorrect samplesheet format")
}
} else {
if (meta && refseq && !fasta && !gff) {
return [ meta, refseq, fastq ]
} else if ( meta && !refseq && fasta && gff ) {
return [ meta, fasta, gff, fastq ]
} else {
error("You are running on default mode. Please check input samplesheet -> Incorrent samplesheet format")
error("You are running genomeqc on default mode. Please check input samplesheet -> Incorrent samplesheet format")
}
}
}
92 changes: 41 additions & 51 deletions workflows/genomeqc.nf
@@ -36,17 +36,15 @@ workflow GENOMEQC {

ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
ch_tree_data = Channel.empty()

ch_samplesheet
| map {
validateInputSamplesheet(it) // Input validation (check local subworkflow)
}
| branch {
ncbi : it.size() == 3
local : it.size() == 4
}
| set { ch_input }

ch_input = ch_samplesheet
| map {
validateInputSamplesheet(it) // Input validation (check local subworkflow)
}
| branch {
ncbi : it.size() == 3
local : it.size() == 4
}

// MODULE: Run create_path

@@ -58,13 +56,12 @@

// For NCBIGENOMEDOWNLOAD

CREATE_PATH.out.accession
| multiMap {
meta, accession ->
meta : meta
accession : accession
}
| set { ch_ncbi_input }
ch_ncbi_input = CREATE_PATH.out.accession
| multiMap {
meta, accession ->
meta : meta
accession : accession
}

//
// MODULE: Run ncbigenomedownload for RefSeq IDs
@@ -79,27 +76,23 @@
ch_versions = ch_versions.mix(NCBIGENOMEDOWNLOAD.out.versions.first())

//
// Define gff and fasta channels
// Prepare input channels
//

//fasta = NCBIGENOMEDOWNLOAD.out.fna.mix( ch_input.local.map { meta, fasta, gff, fq -> tuple( meta, file(fasta) ) } )
//gff = NCBIGENOMEDOWNLOAD.out.gff.mix( ch_input.local.map { meta, fasta, gff, fq -> tuple( meta, file(gff) ) } )

// fasta. We use mix() here because when local files are present,
// then RefSeq IDs should be missing, and vice versa
ch_input.local
| map { meta, fasta, gff, fq -> tuple( meta, file(fasta) ) }
| mix ( NCBIGENOMEDOWNLOAD.out.fna )
| set { fasta }
fasta = ch_input.local
| map { meta, fasta, gff, fq -> tuple( meta, file(fasta) ) }
| mix ( NCBIGENOMEDOWNLOAD.out.fna )

// gff. We use mix() here because when local files are present, then RefSeq IDs should be missing, and vice versa
ch_input.local
| map { meta, fasta, gff, fq -> tuple( meta, file(gff) ) }
| mix ( NCBIGENOMEDOWNLOAD.out.gff )
| set { gff }
gff = ch_input.local
| map { meta, fasta, gff, fq -> tuple( meta, file(gff) ) }
| mix ( NCBIGENOMEDOWNLOAD.out.gff )


// Filter fasta files by extension and create channels for each file type
gz_fasta = fasta.filter { it[1].name.endsWith(".gz") }
gz_fasta = fasta.filter { it[1].name.endsWith(".gz") }
non_gz_fasta = fasta.filter { !it[1].name.endsWith(".gz") }

// Run module uncompress_fasta
Expand All @@ -109,7 +102,7 @@ workflow GENOMEQC {

// Filter gff files by extension and create channels for each file type

gz_gff = gff.filter { it[1].name.endsWith(".gz") }
gz_gff = gff.filter { it[1].name.endsWith(".gz") }
non_gz_gff = gff.filter { !it[1].name.endsWith(".gz") }

// Run module uncompress_GFF
Expand All @@ -120,7 +113,7 @@ workflow GENOMEQC {
// Combine the channels back together so that all the uncompressed files are in channels

ch_fasta = UNCOMPRESS_FASTA.out.file.mix(non_gz_fasta)
ch_gff = UNCOMPRESS_GFF.out.file.mix(non_gz_gff)
ch_gff = UNCOMPRESS_GFF.out.file.mix(non_gz_gff)

//
// Define fastq input channel
Expand All @@ -129,10 +122,9 @@ workflow GENOMEQC {
// FASTQ file is optional in the samplesheet.
// First, get it like you do for gff and fasta

ch_input.ncbi
| map{ meta, refseq, fq -> tuple( meta, fq ) }
| mix( ch_input.local.map{ meta, fasta, gff, fq -> tuple( meta, fq ) } )
| set { ch_fastq }
ch_fastq = ch_input.ncbi
| map{ meta, refseq, fq -> tuple( meta, fq ) }
| mix( ch_input.local.map{ meta, fasta, gff, fq -> tuple( meta, fq ) } )

// Then, check to see that element 1 is not empty, and if not, make it file()
// You have to do this because if you pass in file() in the initial map,
@@ -149,16 +141,15 @@

// Combine the fasta, gff and fastq channels into a single multi-channel object using multiMap, so that they are in sync all the time
// If element (fasta, gff, fq) is empty, it will return an empty (null) channel
ch_fasta
| combine(ch_gff, by:0) // by:0 | Only combine when both channels share the same id
| combine(ch_fastq, by:0)
| multiMap {
meta, fasta, gff, fq ->
fasta : fasta ? tuple( meta, file(fasta) ) : null // Not sure if conditional is necessary anymore
gff : gff ? tuple( meta, file(gff) ) : null
fq : fq ? tuple( meta, file(fq) ) : null
}
| set { ch_input }
ch_input = ch_fasta
| combine(ch_gff, by:0) // by:0 | Only combine when both channels share the same id
| combine(ch_fastq, by:0)
| multiMap {
meta, fasta, gff, fq ->
fasta : fasta ? tuple( meta, file(fasta) ) : null // Not sure if conditional is necessary anymore
gff : gff ? tuple( meta, file(gff) ) : null
fq : fq ? tuple( meta, file(fq) ) : null
}

//
// Run TIDK
@@ -188,11 +179,10 @@
params.kvalue
)
ch_meryl_union = MERYL_UNIONSUM.out.meryl_db
ch_versions = ch_versions.mix(MERYL_UNIONSUM.out.versions.first())
ch_versions = ch_versions.mix(MERYL_UNIONSUM.out.versions.first())
// MODULE: MERQURY_MERQURY
ch_meryl_union
| join(ch_input.fasta)
| set {ch_merqury_inputs}
ch_merqury_inputs = ch_meryl_union.join(ch_input.fasta)

MERQURY_MERQURY ( ch_merqury_inputs )
ch_merqury_qv = MERQURY_MERQURY.out.assembly_qv
ch_merqury_stats = MERQURY_MERQURY.out.stats