Skip to content

Commit

Permalink
Merge pull request #88 from Eco-Flow/agat_longest_isoform
Browse files Browse the repository at this point in the history
WIP: Add longest and nf-core GFFREAD modules
  • Loading branch information
FernandoDuarteF authored Nov 20, 2024
2 parents 64a7fdf + e381163 commit 5206882
Show file tree
Hide file tree
Showing 15 changed files with 465 additions and 66 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,20 @@ For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#intr
1. Downloads the genome and gene annotation files from NCBI `[NCBIGENOMEDOWNLOAD]` - Or you provide your own genomes/annotations
2. Describes genome assembly:
2a. `[BUSCO_BUSCO]`: Determines how complete is the genome compared to expected (protein mode).
2b. `[QUAST]`: Determines the N50, how contiguous the genome is.
2c. More options
2b. `[BUSCO_IDEOGRAM]`: Plots the location of BUSCO markers on the assembly.
2c. `[QUAST]`: Determines the N50, how contiguous the genome is.
2d. More options
3. Describes your annotation : `[AGAT]`: Gene, feature, length, averages, counts.
4. Extract longest protein fasta sequences `[GFFREAD]`.
5. Finds orthologous genes `[ORTHOFINDER]`.
6. Summary with MultiQC.

> [!WARNING]
> We strongly recommend that users specify the lineage using the `--busco_lineage` parameter, as setting the lineage to `auto` (the default value) might cause problems with `[BUSCO]` during the lineage determination step.
> [!NOTE]
> `BUSCO_IDEOGRAM` will only plot those chromosomes -or scaffolds- that contain single copy markers.
**Genome Only (in development):**
1. Downloads the genome files from NCBI `[NCBIGENOMEDOWNLOAD]` - Or you provide your own genomes
2. Describes genome assembly:
Expand Down
17 changes: 13 additions & 4 deletions bin/plot_busco_ideogram.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Load required libraries
library(optparse)
library(RIdeogram)
library(ggplot2)

# Parse command line arguments
option_list <- list(
Expand All @@ -24,7 +25,7 @@ colnames(busco_mappings) <- c("Status", "Chr", "Start", "End")
karyotype <- read.table(opt$karyotype, header = TRUE, stringsAsFactors = FALSE)

# Process BUSCO mappings
busco_mappings$Type <- "BUSCO_marker"
busco_mappings$Type <- busco_mappings$Status
busco_mappings$Shape <- "circle"

# Change status to color
Expand All @@ -35,6 +36,8 @@ colnames(busco_mappings)[1] <- "color"

busco_mappings <- busco_mappings[, c("Type", "Shape", "Chr", "Start", "End", "color")]

head(busco_mappings)

# Ensure karyotype has required columns
required_columns <- c("Chr", "Start", "End")
if (!all(required_columns %in% colnames(karyotype))) {
Expand All @@ -44,10 +47,16 @@ if (!all(required_columns %in% colnames(karyotype))) {
# Use only required columns from karyotype
karyotype <- karyotype[, required_columns]

# Collect the distinct chromosome/scaffold names that carry at least one
# BUSCO marker
chr_w_markers <- unique(sort(busco_mappings$Chr))

# Keep only the karyotype rows for chromosomes on which markers were found,
# so that marker-free chromosomes/scaffolds are left out of the plot
filtered_karyotype <- karyotype[karyotype$Chr %in% chr_w_markers, ]

# Generate ideogram
ideogram(karyotype = karyotype, label = busco_mappings, label_type = "marker", output = paste0(opt$prefix, ".svg"))
ideogram(karyotype = filtered_karyotype, label = busco_mappings, label_type = "marker", output = paste0(opt$prefix, ".svg"))

# Convert to png
convertSVG(paste0(opt$prefix, ".svg"), device = "png")
convertSVG(paste0(opt$prefix, ".svg"), file = opt$prefix, device = "png")

cat("Ideogram has been generated and saved as", paste0(opt$prefix, ".svg"), "and", paste0(opt$prefix, ".png"), "\n")
cat("Ideogram has been generated and saved as", paste0(opt$prefix, ".svg"), "and", paste0(opt$prefix, ".png"), "\n")
26 changes: 24 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ process {
]
}


// Publishing rules for the FASTAVALIDATOR process: files go to
// <params.outdir>/fastavalidator using the pipeline-wide publish mode.
// saveAs returns null for versions.yml (Nextflow does not publish files
// mapped to null); every other file keeps its original name.
withName: 'FASTAVALIDATOR' {
publishDir = [
path: { "${params.outdir}/fastavalidator" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'ORTHOFINDER' {
publishDir = [
path: { "${params.outdir}/orthofinder" },
Expand Down Expand Up @@ -84,6 +93,11 @@ process {

// Settings for the TIDK_EXPLORE process.
// ext.args holds extra command-line options for the module to pick up
// (presumably via task.ext.args - confirm against the module script).
// Outputs are published to <params.outdir>/tidk_explore; saveAs returns null
// for versions.yml so that file is not published.
withName: 'TIDK_EXPLORE' {
ext.args = "--minimum 5 --maximum 12"
publishDir = [
path: { "${params.outdir}/tidk_explore" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'MULTIQC' {
Expand All @@ -103,12 +117,20 @@ process {
]
}

withName: PLOT_BUSCO_IDEOGRAM {
withName: 'PLOT_BUSCO_IDEOGRAM' {
publishDir = [
path: { "${params.outdir}/busco_ideogram" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}


// Publishing rules for the LONGEST process (longest-isoform GFF3).
// Files go to <params.outdir>/longest_isoform using the pipeline-wide
// publish mode; saveAs returns null for versions.yml so it is not published.
withName: 'LONGEST' {
    publishDir = [
        // Directory name was previously misspelled as "longes_isoform"
        path: { "${params.outdir}/longest_isoform" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

}
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"fastavalidator": {
"branch": "master",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"fastqc": {
"branch": "master",
"git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd",
Expand Down
32 changes: 32 additions & 0 deletions modules/local/extract_seqs.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Extract protein FASTA sequences from a genome using the features described
// in a GFF file, by running AGAT's agat_sp_extract_sequences.pl.
//
// Inputs:
//   [ meta, fasta ] - genome sequence file passed to -f
//   [ meta, gff ]   - annotation file passed to -g
// Outputs:
//   prot_fasta - [ meta, <prefix>.prot.fasta ]
//   versions   - versions.yml
//
// <prefix> is task.ext.prefix when set, otherwise meta.id. The output is
// matched with a glob so it is still found when task.ext.prefix overrides
// the default name.
//
// NOTE(review): both input tuples bind the same variable name `meta`;
// confirm which one is meant to drive the tag, prefix and emitted tuple.
// NOTE(review): versions.yml records the Perl interpreter version, not the
// AGAT version - confirm this is intended.
process EXTRACT_SEQS {
    tag "$meta.id"
    label 'process_low'
    //label 'process_med_memory'

    container 'community.wave.seqera.io/library/agat:1.4.1--304a47c62ae478b4'

    input:
    tuple val (meta), path(fasta)
    tuple val (meta), path(gff)

    output:
    tuple val (meta), path( "*.prot.fasta" ), emit: prot_fasta
    path "versions.yml"                     , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Optional extra command-line arguments supplied through the config
    def args   = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    # Extract protein from filtered GFF and genome
    agat_sp_extract_sequences.pl \\
        -g ${gff} \\
        -f ${fasta} \\
        -p -o ${prefix}.prot.fasta --clean_final_stop \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        Perl version: \$(perl --version | grep "version" | sed 's/.*(//g' | sed 's/[)].*//')
    END_VERSIONS
    """

}
18 changes: 12 additions & 6 deletions modules/local/longest.nf
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
process LONGEST {
tag "$meta.id"
label 'process_single'
label 'process_med_memory'
label 'process_medium'

container = 'biocontainers/agat:1.3.0--pl5321hdfd78af_0'
container = 'community.wave.seqera.io/library/agat:1.4.1--304a47c62ae478b4'

input:
tuple val (meta), path(gff)

output:
tuple val (meta), path( "${meta.id}.longest.gff3" ), emit: longest_proteins
path "versions.yml", emit: versions
tuple val (meta), path( "${meta.id}.longest.gff3" ) , emit: longest_proteins
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
# Run agat to find longest orf for each gene
agat_sp_keep_longest_isoform.pl -gff ${gff} -o ${prefix}.longest.gff3
agat_sp_keep_longest_isoform.pl \\
-gff ${gff} \\
-o ${prefix}.longest.gff3 \\
$args
md5sum "${prefix}.longest.gff3" > "${prefix}.longest.gff3.md5"
Expand Down
3 changes: 2 additions & 1 deletion modules/local/plot_busco_ideogram.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ process PLOT_BUSCO_IDEOGRAM {

output:
tuple val(genusspeci), val(lineage), path("*.svg"), emit: svg
tuple val(genusspeci), val(lineage), path("*.png"), emit: png
path "versions.yml" , emit: versions

script:
Expand Down Expand Up @@ -39,4 +40,4 @@ process PLOT_BUSCO_IDEOGRAM {
r-rideogram: \$(Rscript -e "cat(as.character(packageVersion('RIdeogram')))")
END_VERSIONS
"""
}
}
7 changes: 7 additions & 0 deletions modules/nf-core/fastavalidator/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::py_fasta_validator=0.6"
62 changes: 62 additions & 0 deletions modules/nf-core/fastavalidator/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Validate a FASTA file with py_fasta_validator.
//
// Input:
//   [ meta, fasta ] - sample metadata map and the FASTA file to check
// Outputs (both logs are optional; the script below leaves exactly one of
// them matching its glob):
//   success_log - [ meta, *.success.log ], written when the validator printed
//                 nothing to stderr
//   error_log   - [ meta, *.error.log ], the captured stderr when it is
//                 non-empty
//   versions    - versions.yml with the py_fasta_validator version
process FASTAVALIDATOR {
tag "$meta.id"
label 'process_single'

conda "${moduleDir}/environment.yml"
// Use the Galaxy depot image for Singularity unless
// task.ext.singularity_pull_docker_container is set; otherwise use the
// biocontainers image.
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/py_fasta_validator:0.6--py37h595c7a6_0':
'biocontainers/py_fasta_validator:0.6--py37h595c7a6_0' }"

input:
tuple val(meta), path(fasta)

output:
tuple val(meta), path('*.success.log') , emit: success_log , optional: true
tuple val(meta), path('*.error.log') , emit: error_log , optional: true
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

// The validator's stderr is redirected to <prefix>.error.log, and the
// `|| echo` fallback keeps a non-zero exit status from failing the task.
// The outcome is decided by the log's line count: a non-empty log is printed
// and kept as the error log; an empty log is renamed to
// fasta_validate.stderr (so it no longer matches *.error.log) and
// <prefix>.success.log is written instead.
script:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
py_fasta_validator \\
-f $fasta \\
2> "${prefix}.error.log" \\
|| echo "Errors from fasta_validate printed to ${prefix}.error.log"
if [ \$(cat "${prefix}.error.log" | wc -l) -gt 0 ]; then
echo "Validation failed..."
cat \\
"${prefix}.error.log"
else
echo "Validation successful..."
mv \\
"${prefix}.error.log" \\
fasta_validate.stderr
echo "Validation successful..." \\
> "${prefix}.success.log"
fi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
py_fasta_validator: \$(py_fasta_validator -v | sed 's/.* version //')
END_VERSIONS
"""

// Stub: writes a success log and versions.yml without validating the input.
// It still calls `py_fasta_validator -v` to fill in the version.
stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
echo "Validation successful..." \\
> "${prefix}.success.log"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
py_fasta_validator: \$(py_fasta_validator -v | sed 's/.* version //')
END_VERSIONS
"""
}
61 changes: 61 additions & 0 deletions modules/nf-core/fastavalidator/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "fastavalidator"
description: |
"Python C-extension for a simple validator for fasta files. The module emits the validated file or an
error log upon validation failure."
keywords:
- fasta
- validation
- genome
tools:
- fasta_validate:
description: |
"Python C-extension for a simple C code to validate a fasta file. It only checks a few things,
and by default only sets its response via the return code,
so you will need to check that!"
homepage: "https://github.com/linsalrob/py_fasta_validator"
documentation: "https://github.com/linsalrob/py_fasta_validator"
tool_dev_url: "https://github.com/linsalrob/py_fasta_validator"
doi: "10.5281/zenodo.5002710"
licence: ["MIT"]
identifier: ""
input:
- - meta:
type: map
description: |
Groovy Map containing file information
e.g. [ id:'test' ]
- fasta:
type: file
description: Input fasta file
pattern: "*.fasta"
output:
- success_log:
- meta:
type: map
description: |
Groovy Map containing file information
e.g. [ id:'test' ]
- "*.success.log":
type: file
description: Log file for successful validation
pattern: "*.success.log"
- error_log:
- meta:
type: map
description: |
Groovy Map containing file information
e.g. [ id:'test' ]
- "*.error.log":
type: file
description: Log file for failed validation
pattern: "*.error.log"
- versions:
- versions.yml:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@gallvp"
maintainers:
- "@gallvp"
Loading

0 comments on commit 5206882

Please sign in to comment.