Skip to content

Commit

Permalink
Merge pull request nf-core#20 from JoseEspinosa/updates
Browse files Browse the repository at this point in the history
Add validation for fasta input and tests
  • Loading branch information
JoseEspinosa authored May 13, 2024
2 parents 278d34f + cc50269 commit 8ba2862
Show file tree
Hide file tree
Showing 22 changed files with 187 additions and 199 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,32 @@ jobs:
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
test_fasta:
name: Run pipeline with test data with fasta files in samplesheet
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/reportho') }}"
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "23.04.0"
- "latest-everything"
steps:
- name: Check out pipeline code
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4

- name: Install Nextflow
uses: nf-core/setup-nextflow@v2
with:
version: "${{ matrix.NXF_VER }}"

- name: Disk space cleanup
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_fasta,docker --outdir ./results
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,20 @@ Steps that follow can be skipped with `--skip_downstream` in batch analysis.
First, prepare a samplesheet with your input data that looks as follows:

`samplesheet.csv`:

```csv
id,query
```csv title="samplesheet_fasta.csv"
id,fasta
BicD2,data/bicd2.fasta
```

or:
or if you know the UniProt ID of the protein, you can provide it directly:

```csv
```csv title="samplesheet.csv"
id,query
BicD2,Q8TD16
```

If using the latter format, you must set `--uniprot_query` to true.
> [!NOTE]
> If you provide both a FASTA file and a UniProt ID, only the latter will be used.
Now, you can run the pipeline using:

Expand Down
3 changes: 3 additions & 0 deletions assets/samplesheet_fasta.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,fasta
ste2,https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/sequences/ste2.fa
ste3,https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/sequences/ste3.fa
18 changes: 16 additions & 2 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,22 @@
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "A query must be provided"
},
"fasta": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.fa(sta)?$",
"errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'"
}
}
},
"anyOf": [
{
"required": ["id", "query"]
},
"required": ["id", "query"]
}
{
"required": ["id", "fasta"]
}
]
}
6 changes: 3 additions & 3 deletions bin/fetch_oma_by_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def main() -> None:

# Find the main isoform
for it in json["targets"]:
if it["is_main_isoform"]:
entry = it
break
if it["is_main_isoform"]:
entry = it
break

# Write exact match status
if json["identified_by"] == "exact match":
Expand Down
1 change: 0 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'

// Other parameters
uniprot_query = true
skip_eggnog = true
min_score = 3
skip_iqtree = true
Expand Down
31 changes: 31 additions & 0 deletions conf/test_fasta.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/reportho -profile test_fasta,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet_fasta.csv'

// Other parameters
skip_eggnog = true
min_score = 3
skip_iqtree = true
fastme_bootstrap = 0
}

1 change: 0 additions & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'

// Other parameters
uniprot_query = true
eggnog_path = 'http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/1/1_members.tsv.gz'
eggnog_idmap_path = "http://eggnog5.embl.de/download/eggnog_5.0/id_mappings/uniprot/latest.Eukaryota.tsv.gz"
min_score = 3
Expand Down
19 changes: 10 additions & 9 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,29 @@ You will need to create a samplesheet with information about the samples you wou

### Full samplesheet

The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 2 columns to match those defined in the table below.
The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 2 columns to match those defined in the tables below.

A final samplesheet file may look something like the one below, with `--uniprot_query` enabled:
A final samplesheet file may look something like the one below:

```csv title="samplesheet.csv"
id,query
BicD2,Q8TD16
```

or the one below, otherwise:
or the one below, if you provide the sequence of the protein in FASTA format:

```csv title="samplesheet_fasta.csv"
id,query
id,fasta
BicD2,/home/myuser/data/bicd2.fa
```

| Column | Description |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `id` | User-defined identifier. It is used to identify output files for the protein. Can be anything descriptive, as long as it does not contain spaces. |
| `query` | The query of the user-specified type. If `--uniprot_query` is `true`, it should be a valid Uniprot accession. Otherwise, it should be a valid path to a FASTA file. |
| Column | Description |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| `id` | User-defined identifier. It is used to identify output files for the protein. Can be anything descriptive, as long as it does not contain spaces. |
| `query` | The query of the user-specified type. It should be a valid Uniprot accession. |
| `fasta` | It should be a valid path to a FASTA file. |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
An [example UniProt samplesheet](../assets/samplesheet.csv) and an [example FASTA samplesheet](../assets/samplesheet_fasta.csv) have been provided with the pipeline.

## Running the pipeline

Expand Down
Empty file.
9 changes: 6 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,17 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_repo
workflow NFCORE_REPORTHO {

take:
samplesheet // channel: samplesheet read in from --input
samplesheet_query // channel: samplesheet read in from --input with query
samplesheet_fasta // channel: samplesheet read in from --input with fasta

main:

//
// WORKFLOW: Run pipeline
//
REPORTHO (
samplesheet
samplesheet_query,
samplesheet_fasta,
)

// emit:
Expand Down Expand Up @@ -75,7 +77,8 @@ workflow {
// WORKFLOW: Run main workflow
//
NFCORE_REPORTHO (
PIPELINE_INITIALISATION.out.samplesheet
PIPELINE_INITIALISATION.out.samplesheet_query,
PIPELINE_INITIALISATION.out.samplesheet_fasta,
)

//
Expand Down
2 changes: 0 additions & 2 deletions modules/local/dump_params.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ process DUMP_PARAMS {

input:
tuple val(meta), path(exact)
val uniprot_query
val use_structures
val use_centroid
val min_score
Expand All @@ -26,7 +25,6 @@ process DUMP_PARAMS {
"""
cat <<- END_PARAMS > params.yml
id: ${meta.id}
uniprot_query: ${uniprot_query}
exact_match: \$(cat $exact)
use_structures: ${use_structures}
use_centroid: ${use_centroid}
Expand Down
4 changes: 2 additions & 2 deletions modules/local/fetch_sequences_online.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ process FETCH_SEQUENCES_ONLINE {
task.ext.when == null || task.ext.when

script:
prefix = task.ext.prefix ?: meta.id
add_query = params.uniprot_query ? "" : "cat $query_fasta >> ${prefix}_orthologs.fa"
def prefix = task.ext.prefix ?: meta.id
def add_query = query_fasta == [] ? "" : "cat $query_fasta >> ${prefix}_orthologs.fa"
"""
fetch_sequences.py $ids $prefix > ${prefix}_orthologs.fa
$add_query
Expand Down
8 changes: 4 additions & 4 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
params {
// Input options
input = null
uniprot_query = false

// MultiQC options
multiqc_config = null
Expand Down Expand Up @@ -200,8 +199,9 @@ profiles {
executor.cpus = 4
executor.memory = 8.GB
}
test { includeConfig 'conf/test.config' }
test_full { includeConfig 'conf/test_full.config' }
test { includeConfig 'conf/test.config' }
test_fasta { includeConfig 'conf/test_fasta.config' }
test_full { includeConfig 'conf/test_full.config' }
}

// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
Expand All @@ -214,7 +214,7 @@ singularity.registry = 'quay.io'

// Nextflow plugins
plugins {
id 'nf-[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
id 'nf-[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
}

// Export these variables to prevent local Python/R libraries from conflicting with those in the container
Expand Down
6 changes: 0 additions & 6 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@
"help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/reportho/usage#samplesheet-input).",
"fa_icon": "fas fa-file-csv"
},
"uniprot_query": {
"type": "boolean",
"description": "The input contains a Uniprot ID as query.",
"help_text": "If the input file contains a Uniprot ID as query, set this parameter to `true`.",
"fa_icon": "fas fa-database"
},
"outdir": {
"type": "string",
"format": "directory-path",
Expand Down
13 changes: 4 additions & 9 deletions subworkflows/local/align.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ workflow ALIGN {
ch_for_filter
)

ch_versions
.mix(FILTER_FASTA.out.versions)
.set { ch_versions }
ch_versions = ch_versions.mix(FILTER_FASTA.out.versions)

CREATE_TCOFFEETEMPLATE(
ch_pdb
Expand All @@ -52,9 +50,8 @@ workflow ALIGN {
TCOFFEE_3DALIGN.out.alignment
.set { ch_alignment }

ch_versions
.mix(TCOFFEE_3DALIGN.out.versions)
.set { ch_versions }
ch_versions = ch_versions.mix(TCOFFEE_3DALIGN.out.versions)

}
else {
TCOFFEE_ALIGN (
Expand All @@ -67,9 +64,7 @@ workflow ALIGN {
TCOFFEE_ALIGN.out.alignment
.set { ch_alignment }

ch_versions
.mix(TCOFFEE_ALIGN.out.versions)
.set { ch_versions }
ch_versions = ch_versions.mix(TCOFFEE_ALIGN.out.versions)
}

emit:
Expand Down
8 changes: 5 additions & 3 deletions subworkflows/local/fetch_sequences.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ include { FETCH_SEQUENCES_ONLINE } from "../../modules/local/fetch_sequences_onl

workflow FETCH_SEQUENCES {
take:
ch_idlist
ch_query_fasta
ch_id_list
ch_query

main:
ch_id_list
.join(ch_query)
.set { ch_input }

ch_input = params.uniprot_query ? ch_idlist.map { it -> [it[0], it[1], []]} : ch_idlist.join(ch_query_fasta)
FETCH_SEQUENCES_ONLINE (
ch_input
)
Expand Down
Loading

0 comments on commit 8ba2862

Please sign in to comment.