Skip to content

Commit

Permalink
Added min_contig_length validation and integrated filtering to main.nf
Browse files Browse the repository at this point in the history
  • Loading branch information
Yasin Kaya committed Dec 12, 2024
1 parent ee702d7 commit af407e6
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 8 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,16 @@ Each row represents an input genome and the fields are:
- `fasta:` fasta file for the genome
- `is_masked`: yes or no to denote whether the fasta file is already masked or not

#### `--min_contig_length`
- **Description**: Minimum length (in base pairs) of contigs to include in the analysis.
- **Default**: 5000
- **Example**:
```bash
nextflow run main.nf --min_contig_length 10000
```
This will exclude all contigs shorter than 10,000 bp from the analysis.


At minimum, a file with proteins as evidence is also required. Now, you can run the pipeline using:

```bash
Expand Down
56 changes: 49 additions & 7 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,35 @@ include { GENEPAL } from './workflows/genepal'
include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_genepal_pipeline'
include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_genepal_pipeline'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
PROCESS: Filter Genome Assembly by Minimum Contig Length
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

process SEQKIT_GET_LENGTH {
tag "${meta.id}"
label 'process_medium'
container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
? 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0'
: 'quay.io/biocontainers/seqkit:2.4.0--h9ee0642_0'}"

input:
tuple val(meta), path(genome_fasta)

output:
tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta

script:
"""
# Filter contigs based on length and output filtered FASTA
seqkit seq --min-len ${params.min_contig_length} ${genome_fasta} > filtered_${meta.id}.fasta
# Generate a list of filtered contigs
seqkit fx2tab --length --name filtered_${meta.id}.fasta | awk '{print \$1}' > ${meta.id}_contig_list.txt
"""
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
NAMED WORKFLOWS FOR PIPELINE
Expand Down Expand Up @@ -48,10 +77,15 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {

main:
//
// WORKFLOW: Run pipeline
// Filter genome assembly by minimum contig length
//
SEQKIT_GET_LENGTH(ch_target_assembly)

//
// Run GENEPAL main workflow using filtered FASTA
//
GENEPAL(
ch_target_assembly,
SEQKIT_GET_LENGTH.out.filtered_fasta.map { meta, fasta, contig_list -> [ meta, fasta ] }, // Filtered genome FASTA
ch_tar_assm_str,
ch_is_masked,
ch_te_library,
Expand All @@ -68,9 +102,11 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
ch_tsebra_config,
ch_orthofinder_pep
)

emit:
multiqc_report = GENEPAL.out.multiqc_report // channel: /path/to/multiqc_report.html
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
Expand All @@ -81,9 +117,9 @@ workflow {

main:
//
// SUBWORKFLOW: Run initialisation tasks
// SUBWORKFLOW: Run initialization tasks
//
PIPELINE_INITIALISATION (
PIPELINE_INITIALISATION(
params.version,
params.monochrome_logs,
args,
Expand All @@ -95,10 +131,15 @@ workflow {
)

//
// WORKFLOW: Run main workflow
// Filter genome assembly by minimum contig length
//
SEQKIT_GET_LENGTH(PIPELINE_INITIALISATION.out.target_assembly)

//
// Run main workflow using filtered FASTA
//
PLANTFOODRESEARCHOPEN_GENEPAL(
PIPELINE_INITIALISATION.out.target_assembly,
SEQKIT_GET_LENGTH.out.filtered_fasta,
PIPELINE_INITIALISATION.out.tar_assm_str,
PIPELINE_INITIALISATION.out.is_masked,
PIPELINE_INITIALISATION.out.te_library,
Expand All @@ -115,10 +156,11 @@ workflow {
PIPELINE_INITIALISATION.out.tsebra_config,
PIPELINE_INITIALISATION.out.orthofinder_pep
)

//
// SUBWORKFLOW: Run completion tasks
//
PIPELINE_COMPLETION (
PIPELINE_COMPLETION(
params.email,
params.email_on_fail,
params.plaintext_email,
Expand Down
11 changes: 10 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ params {
orthofinder_annotations = null
outdir = null
email = null
min_contig_length = 5000

// Repeat annotation options
repeat_annotator = 'repeatmodeler'
Expand Down Expand Up @@ -79,7 +80,15 @@ params {
custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"

}

// Validation for the min_contig_length parameter
process {
beforeScript = """
if [[ ${params.min_contig_length} -le 1000 ]]; then
echo "ERROR: The parameter 'min_contig_length' must be greater than 1 kbp (1000 base pairs). Provided value: ${params.min_contig_length}" >&2
exit 1
fi
"""
}
// Max resources
process {
resourceLimits = [
Expand Down
Empty file.

0 comments on commit af407e6

Please sign in to comment.