Skip to content

Commit

Permalink
Merge pull request nf-core#20 from JoseEspinosa/updates
Browse files Browse the repository at this point in the history
Add validation for fasta input and tests
  • Loading branch information
JoseEspinosa authored May 13, 2024
2 parents 278d34f + cc50269 commit 8ba2862
Show file tree
Hide file tree
Showing 22 changed files with 187 additions and 199 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,32 @@ jobs:
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
test_fasta:
name: Run pipeline with test data with fasta files in samplesheet
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/reportho') }}"
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "23.04.0"
- "latest-everything"
steps:
- name: Check out pipeline code
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4

- name: Install Nextflow
uses: nf-core/setup-nextflow@v2
with:
version: "${{ matrix.NXF_VER }}"

- name: Disk space cleanup
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_fasta,docker --outdir ./results
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,20 @@ Steps that follow can be skipped with `--skip_downstream` in batch analysis.
First, prepare a samplesheet with your input data that looks as follows:

`samplesheet.csv`:

```csv
id,query
```csv title="samplesheet_fasta.csv"
id,fasta
BicD2,data/bicd2.fasta
```

or:
or if you know the UniProt ID of the protein, you can provide it directly:

```csv
```csv title="samplesheet.csv"
id,query
BicD2,Q8TD16
```

If using the latter format, you must set `--uniprot_query` to true.
> [!NOTE]
> If you provide both a FASTA file and a UniProt ID, only the latter will be used.
Now, you can run the pipeline using:

Expand Down
3 changes: 3 additions & 0 deletions assets/samplesheet_fasta.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,fasta
ste2,https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/sequences/ste2.fa
ste3,https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/sequences/ste3.fa
18 changes: 16 additions & 2 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,22 @@
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "A query must be provided"
},
"fasta": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.fa(sta)?$",
"errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'"
}
}
},
"anyOf": [
{
"required": ["id", "query"]
},
"required": ["id", "query"]
}
{
"required": ["id", "fasta"]
}
]
}
6 changes: 3 additions & 3 deletions bin/fetch_oma_by_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def main() -> None:

# Find the main isoform
for it in json["targets"]:
if it["is_main_isoform"]:
entry = it
break
if it["is_main_isoform"]:
entry = it
break

# Write exact match status
if json["identified_by"] == "exact match":
Expand Down
1 change: 0 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'

// Other parameters
uniprot_query = true
skip_eggnog = true
min_score = 3
skip_iqtree = true
Expand Down
31 changes: 31 additions & 0 deletions conf/test_fasta.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/reportho -profile test_fasta,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet_fasta.csv'

// Other parameters
skip_eggnog = true
min_score = 3
skip_iqtree = true
fastme_bootstrap = 0
}

1 change: 0 additions & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'

// Other parameters
uniprot_query = true
eggnog_path = 'http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/1/1_members.tsv.gz'
eggnog_idmap_path = "http://eggnog5.embl.de/download/eggnog_5.0/id_mappings/uniprot/latest.Eukaryota.tsv.gz"
min_score = 3
Expand Down
19 changes: 10 additions & 9 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,29 @@ You will need to create a samplesheet with information about the samples you wou

### Full samplesheet

The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 2 columns to match those defined in the table below.
The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 2 columns to match those defined in the tables below.

A final samplesheet file may look something like the one below, with `--uniprot_query` enabled:
A final samplesheet file may look something like the one below:

```csv title="samplesheet.csv"
id,query
BicD2,Q8TD16
```

or the one below, otherwise:
or the one below, if you provide the sequence of the protein in FASTA format:

```csv title="samplesheet_fasta.csv"
id,query
id,fasta
BicD2,/home/myuser/data/bicd2.fa
```

| Column | Description |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `id` | User-defined identifier. It is used to identify output files for the protein. Can be anything descriptive, as long as it does not contain spaces. |
| `query` | The query of the user-specified type. If `--uniprot_query` is `true`, it should be a valid Uniprot accession. Otherwise, it should be a valid path to a FASTA file. |
| Column | Description |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| `id` | User-defined identifier. It is used to identify output files for the protein. Can be anything descriptive, as long as it does not contain spaces. |
| `query` | The query of the user-specified type. It should be a valid Uniprot accession. |
| `fasta` | It should be a valid path to a FASTA file. |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
An [example UniProt samplesheet](../assets/samplesheet.csv) and an [example FASTA samplesheet](../assets/samplesheet_fasta.csv) have been provided with the pipeline.

## Running the pipeline

Expand Down
Empty file.
9 changes: 6 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,17 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_repo
workflow NFCORE_REPORTHO {

take:
samplesheet // channel: samplesheet read in from --input
samplesheet_query // channel: samplesheet read in from --input with query
samplesheet_fasta // channel: samplesheet read in from --input with fasta

main:

//
// WORKFLOW: Run pipeline
//
REPORTHO (
samplesheet
samplesheet_query,
samplesheet_fasta,
)

// emit:
Expand Down Expand Up @@ -75,7 +77,8 @@ workflow {
// WORKFLOW: Run main workflow
//
NFCORE_REPORTHO (
PIPELINE_INITIALISATION.out.samplesheet
PIPELINE_INITIALISATION.out.samplesheet_query,
PIPELINE_INITIALISATION.out.samplesheet_fasta,
)

//
Expand Down
2 changes: 0 additions & 2 deletions modules/local/dump_params.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ process DUMP_PARAMS {

input:
tuple val(meta), path(exact)
val uniprot_query
val use_structures
val use_centroid
val min_score
Expand All @@ -26,7 +25,6 @@ process DUMP_PARAMS {
"""
cat <<- END_PARAMS > params.yml
id: ${meta.id}
uniprot_query: ${uniprot_query}
exact_match: \$(cat $exact)
use_structures: ${use_structures}
use_centroid: ${use_centroid}
Expand Down
4 changes: 2 additions & 2 deletions modules/local/fetch_sequences_online.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ process FETCH_SEQUENCES_ONLINE {
task.ext.when == null || task.ext.when

script:
prefix = task.ext.prefix ?: meta.id
add_query = params.uniprot_query ? "" : "cat $query_fasta >> ${prefix}_orthologs.fa"
def prefix = task.ext.prefix ?: meta.id
def add_query = query_fasta == [] ? "" : "cat $query_fasta >> ${prefix}_orthologs.fa"
"""
fetch_sequences.py $ids $prefix > ${prefix}_orthologs.fa
$add_query
Expand Down
8 changes: 4 additions & 4 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
params {
// Input options
input = null
uniprot_query = false

// MultiQC options
multiqc_config = null
Expand Down Expand Up @@ -200,8 +199,9 @@ profiles {
executor.cpus = 4
executor.memory = 8.GB
}
test { includeConfig 'conf/test.config' }
test_full { includeConfig 'conf/test_full.config' }
test { includeConfig 'conf/test.config' }
test_fasta { includeConfig 'conf/test_fasta.config' }
test_full { includeConfig 'conf/test_full.config' }
}

// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
Expand All @@ -214,7 +214,7 @@ singularity.registry = 'quay.io'

// Nextflow plugins
plugins {
id 'nf-[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
id 'nf-[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
}

// Export these variables to prevent local Python/R libraries from conflicting with those in the container
Expand Down
6 changes: 0 additions & 6 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@
"help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/reportho/usage#samplesheet-input).",
"fa_icon": "fas fa-file-csv"
},
"uniprot_query": {
"type": "boolean",
"description": "The input contains a Uniprot ID as query.",
"help_text": "If the input file contains a Uniprot ID as query, set this parameter to `true`.",
"fa_icon": "fas fa-database"
},
"outdir": {
"type": "string",
"format": "directory-path",
Expand Down
13 changes: 4 additions & 9 deletions subworkflows/local/align.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ workflow ALIGN {
ch_for_filter
)

ch_versions
.mix(FILTER_FASTA.out.versions)
.set { ch_versions }
ch_versions = ch_versions.mix(FILTER_FASTA.out.versions)

CREATE_TCOFFEETEMPLATE(
ch_pdb
Expand All @@ -52,9 +50,8 @@ workflow ALIGN {
TCOFFEE_3DALIGN.out.alignment
.set { ch_alignment }

ch_versions
.mix(TCOFFEE_3DALIGN.out.versions)
.set { ch_versions }
ch_versions = ch_versions.mix(TCOFFEE_3DALIGN.out.versions)

}
else {
TCOFFEE_ALIGN (
Expand All @@ -67,9 +64,7 @@ workflow ALIGN {
TCOFFEE_ALIGN.out.alignment
.set { ch_alignment }

ch_versions
.mix(TCOFFEE_ALIGN.out.versions)
.set { ch_versions }
ch_versions = ch_versions.mix(TCOFFEE_ALIGN.out.versions)
}

emit:
Expand Down
8 changes: 5 additions & 3 deletions subworkflows/local/fetch_sequences.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ include { FETCH_SEQUENCES_ONLINE } from "../../modules/local/fetch_sequences_onl

workflow FETCH_SEQUENCES {
take:
ch_idlist
ch_query_fasta
ch_id_list
ch_query

main:
ch_id_list
.join(ch_query)
.set { ch_input }

ch_input = params.uniprot_query ? ch_idlist.map { it -> [it[0], it[1], []]} : ch_idlist.join(ch_query_fasta)
FETCH_SEQUENCES_ONLINE (
ch_input
)
Expand Down
Loading

0 comments on commit 8ba2862

Please sign in to comment.