Merge pull request #88 from Eco-Flow/agat_longest_isoform

WIP: Add longest and nf-core GFFREAD modules
nf-core · Nov 20, 2024 · 5206882 · 5206882
2 parents 64a7fdf + e381163
commit 5206882
Show file tree

Hide file tree

Showing 15 changed files with 465 additions and 66 deletions.
diff --git a/README.md b/README.md
@@ -28,13 +28,20 @@ For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#intr
 1. Downloads the genome and gene annotation files from NCBI `[NCBIGENOMEDOWNLOAD]` - Or you provide your own genomes/annotations
 2. Describes genome assembly:
 2a. `[BUSCO_BUSCO]`: Determines how complete the genome is compared to what is expected (protein mode).
-2b. `[QUAST]`: Determines the N50, how contiguous the genome is.
-2c. More options
+2b. `[BUSCO_IDEOGRAM]`: Plots the location of BUSCO markers on the assembly.
+2c. `[QUAST]`: Determines the N50, how contiguous the genome is.
+2d. More options
 3. Describes your annotation : `[AGAT]`: Gene, feature, length, averages, counts. 
 4. Extract longest protein fasta sequences `[GFFREAD]`.
 5. Finds orthologous genes `[ORTHOFINDER]`.
 6. Summary with MultiQC.
 
+> [!WARNING]
+> We strongly suggest that users specify the lineage using the `--busco_lineage` parameter, as setting the lineage to `auto` (the default value) might cause problems with `[BUSCO]` during the lineage determination step.
+
+> [!NOTE]
+> `BUSCO_IDEOGRAM` will only plot those chromosomes -or scaffolds- that contain single copy markers.
+
 **Genome Only (in development):**
 1. Downloads the genome files from NCBI `[NCBIGENOMEDOWNLOAD]` - Or you provide your own genomes
 2. Describes genome assembly:

diff --git a/bin/plot_busco_ideogram.R b/bin/plot_busco_ideogram.R
@@ -5,6 +5,7 @@
 # Load required libraries
 library(optparse)
 library(RIdeogram)
+library(ggplot2)
 
 # Parse command line arguments
 option_list <- list(
@@ -24,7 +25,7 @@ colnames(busco_mappings) <- c("Status", "Chr", "Start", "End")
 karyotype <- read.table(opt$karyotype, header = TRUE, stringsAsFactors = FALSE)
 
 # Process BUSCO mappings
-busco_mappings$Type <- "BUSCO_marker"
+busco_mappings$Type <- busco_mappings$Status
 busco_mappings$Shape <- "circle" 
 
 # Change status to color
@@ -35,6 +36,8 @@ colnames(busco_mappings)[1] <- "color"
 
 busco_mappings <- busco_mappings[, c("Type", "Shape", "Chr", "Start", "End", "color")]
 
+head(busco_mappings)
+
 # Ensure karyotype has required columns
 required_columns <- c("Chr", "Start", "End")
 if (!all(required_columns %in% colnames(karyotype))) {
@@ -44,10 +47,16 @@ if (!all(required_columns %in% colnames(karyotype))) {
 # Use only required columns from karyotype
 karyotype <- karyotype[, required_columns]
 
+# Create a vector with all the chromosomes that contain markers
+chr_w_markers = unique(sort(busco_mappings$Chr))
+
+# Plot only those chromosomes in which markers were found
+filtered_karyotype = karyotype[karyotype$Chr %in% chr_w_markers, ]
+
 # Generate ideogram
-ideogram(karyotype = karyotype, label = busco_mappings, label_type = "marker", output = paste0(opt$prefix, ".svg"))
+ideogram(karyotype = filtered_karyotype, label = busco_mappings, label_type = "marker", output = paste0(opt$prefix, ".svg"))
 
 # Convert to png
-convertSVG(paste0(opt$prefix, ".svg"), device = "png")
+convertSVG(paste0(opt$prefix, ".svg"), file = opt$prefix, device = "png")
 
-cat("Ideogram has been generated and saved as", paste0(opt$prefix, ".svg"), "and", paste0(opt$prefix, ".png"), "\n")
+cat("Ideogram has been generated and saved as", paste0(opt$prefix, ".svg"), "and", paste0(opt$prefix, ".png"), "\n")
diff --git a/conf/modules.config b/conf/modules.config
@@ -40,6 +40,15 @@ process {
         ]
     }
 
+
+    withName: 'FASTAVALIDATOR' {
+        publishDir = [
+            path: { "${params.outdir}/fastavalidator" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: 'ORTHOFINDER' {
         publishDir = [
             path: { "${params.outdir}/orthofinder" },
@@ -84,6 +93,11 @@ process {
 
     withName: 'TIDK_EXPLORE' {
         ext.args = "--minimum 5 --maximum 12"
+        publishDir = [
+            path: { "${params.outdir}/tidk_explore" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]        
     }
 
     withName: 'MULTIQC' {
@@ -103,12 +117,20 @@ process {
         ]
     }
 
-    withName: PLOT_BUSCO_IDEOGRAM {
+    withName: 'PLOT_BUSCO_IDEOGRAM' {
         publishDir = [
             path: { "${params.outdir}/busco_ideogram" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
-
+
+    withName: 'LONGEST' {
+        publishDir = [
+            path: { "${params.outdir}/longes_isoform" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
 }
diff --git a/modules.json b/modules.json
@@ -20,6 +20,11 @@
             "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
             "installed_by": ["modules"]
           },
+          "fastavalidator": {
+            "branch": "master",
+            "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+            "installed_by": ["modules"]
+          },
           "fastqc": {
             "branch": "master",
             "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd",

diff --git a/modules/local/extract_seqs.nf b/modules/local/extract_seqs.nf
@@ -0,0 +1,32 @@
+process EXTRACT_SEQS {
+    tag "$meta.id"
+    label 'process_low'
+    //label 'process_med_memory'
+
+    container = 'community.wave.seqera.io/library/agat:1.4.1--304a47c62ae478b4'
+
+    input:
+    tuple val (meta),  path(fasta)
+    tuple val (meta),  path(gff)
+
+    output:
+    tuple val (meta), path( "${meta.id}.prot.fasta" ), emit: prot_fasta
+    path "versions.yml"                              , emit: versions
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    # Extract protein from filtered GFF and genome
+    agat_sp_extract_sequences.pl \\
+    -g ${gff} \\
+    -f ${fasta} \\
+    -p -o ${prefix}.prot.fasta --clean_final_stop
+    
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        Perl version: \$(perl --version | grep "version" | sed 's/.*(//g' | sed 's/[)].*//')
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/local/longest.nf b/modules/local/longest.nf
@@ -1,22 +1,28 @@
 process LONGEST {
     tag "$meta.id"
-    label 'process_single'
-    label 'process_med_memory'
+    label 'process_medium'
 
-    container = 'biocontainers/agat:1.3.0--pl5321hdfd78af_0'
+    container = 'community.wave.seqera.io/library/agat:1.4.1--304a47c62ae478b4'
 
     input:
     tuple val (meta),  path(gff)
 
     output:
-    tuple val (meta), path( "${meta.id}.longest.gff3" ),                emit: longest_proteins
-    path "versions.yml", emit: versions
+    tuple val (meta), path( "${meta.id}.longest.gff3" ) , emit: longest_proteins
+    path "versions.yml"                                 , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
 
     script:
+    def args = task.ext.args     ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
     # Run agat to find longest orf for each gene 
-    agat_sp_keep_longest_isoform.pl -gff ${gff} -o ${prefix}.longest.gff3
+    agat_sp_keep_longest_isoform.pl \\
+    -gff ${gff} \\
+    -o ${prefix}.longest.gff3 \\
+    $args
     
     md5sum "${prefix}.longest.gff3" > "${prefix}.longest.gff3.md5"
 

diff --git a/modules/local/plot_busco_ideogram.nf b/modules/local/plot_busco_ideogram.nf
@@ -10,6 +10,7 @@ process PLOT_BUSCO_IDEOGRAM {
 
     output:
     tuple val(genusspeci), val(lineage), path("*.svg"), emit: svg
+    tuple val(genusspeci), val(lineage), path("*.png"), emit: png
     path "versions.yml"           , emit: versions
 
     script:
@@ -39,4 +40,4 @@ process PLOT_BUSCO_IDEOGRAM {
         r-rideogram: \$(Rscript -e "cat(as.character(packageVersion('RIdeogram')))")
     END_VERSIONS
     """
-}
+}
diff --git a/modules/nf-core/fastavalidator/environment.yml b/modules/nf-core/fastavalidator/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::py_fasta_validator=0.6"
diff --git a/modules/nf-core/fastavalidator/main.nf b/modules/nf-core/fastavalidator/main.nf
@@ -0,0 +1,62 @@
+process FASTAVALIDATOR {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/py_fasta_validator:0.6--py37h595c7a6_0':
+        'biocontainers/py_fasta_validator:0.6--py37h595c7a6_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path('*.success.log')  , emit: success_log , optional: true
+    tuple val(meta), path('*.error.log')    , emit: error_log   , optional: true
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    py_fasta_validator \\
+        -f $fasta \\
+        2> "${prefix}.error.log" \\
+        || echo "Errors from fasta_validate printed to ${prefix}.error.log"
+
+    if [ \$(cat "${prefix}.error.log" | wc -l) -gt 0 ]; then
+        echo "Validation failed..."
+
+        cat \\
+            "${prefix}.error.log"
+    else
+        echo "Validation successful..."
+
+        mv \\
+            "${prefix}.error.log" \\
+            fasta_validate.stderr
+
+        echo "Validation successful..." \\
+            > "${prefix}.success.log"
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        py_fasta_validator: \$(py_fasta_validator -v | sed 's/.* version //')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo "Validation successful..." \\
+        > "${prefix}.success.log"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        py_fasta_validator: \$(py_fasta_validator -v | sed 's/.* version //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/fastavalidator/meta.yml b/modules/nf-core/fastavalidator/meta.yml
@@ -0,0 +1,61 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "fastavalidator"
+description: |
+  "Python C-extension for a simple validator for fasta files. The module emits the validated file or an
+  error log upon validation failure."
+keywords:
+  - fasta
+  - validation
+  - genome
+tools:
+  - fasta_validate:
+      description: |
+        "Python C-extension for a simple C code to validate a fasta file. It only checks a few things,
+        and by default only sets its response via the return code,
+        so you will need to check that!"
+      homepage: "https://github.com/linsalrob/py_fasta_validator"
+      documentation: "https://github.com/linsalrob/py_fasta_validator"
+      tool_dev_url: "https://github.com/linsalrob/py_fasta_validator"
+      doi: "10.5281/zenodo.5002710"
+      licence: ["MIT"]
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing file information
+          e.g. [ id:'test' ]
+    - fasta:
+        type: file
+        description: Input fasta file
+        pattern: "*.fasta"
+output:
+  - success_log:
+      - meta:
+          type: map
+          description: |
+            Groovy Map containing file information
+            e.g. [ id:'test' ]
+      - "*.success.log":
+          type: file
+          description: Log file for successful validation
+          pattern: "*.success.log"
+  - error_log:
+      - meta:
+          type: map
+          description: |
+            Groovy Map containing file information
+            e.g. [ id:'test' ]
+      - "*.error.log":
+          type: file
+          description: Log file for failed validation
+          pattern: "*.error.log"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+authors:
+  - "@gallvp"
+maintainers:
+  - "@gallvp"