Added offline run flag and profile #29

Merged · 16 commits · May 23, 2024
32 changes: 26 additions & 6 deletions .github/workflows/ci.yml
@@ -39,9 +39,6 @@ jobs:
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- name: Run pipeline with test data
-      # TODO nf-core: You can customise CI pipeline run tests as required
-      # For example: adding multiple test runs with different parameters
-      # Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results

@@ -68,8 +65,31 @@ jobs:
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- name: Run pipeline with test data
-      # TODO nf-core: You can customise CI pipeline run tests as required
-      # For example: adding multiple test runs with different parameters
-      # Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_fasta,docker --outdir ./results

+  test_offline:
+    name: Run ortholog fetching with offline databases
+    # Only run on push if this is the nf-core dev branch (merged PRs)
+    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/reportho') }}"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        NXF_VER:
+          - "23.04.0"
+          - "latest-everything"
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
+
+      - name: Install Nextflow
+        uses: nf-core/setup-nextflow@v2
+        with:
+          version: "${{ matrix.NXF_VER }}"
+
+      - name: Disk space cleanup
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+
+      - name: Run pipeline with test data
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_offline,docker --outdir ./results
4 changes: 4 additions & 0 deletions bin/make_hits_table.py
@@ -20,6 +20,10 @@ def main() -> None:
reader = csv.DictReader(f)
data = list(reader)

+    if not data:
+        print("id")
+        return

sample_id = sys.argv[2]

# Get list of databases
3 changes: 3 additions & 0 deletions bin/make_score_table.py
@@ -21,6 +21,9 @@ def main() -> None:
reader = csv.reader(f)
data = list(reader)

+    if not data:
+        return

# Get the header and the data
header = data[0]
data = data[1:]
5 changes: 4 additions & 1 deletion bin/make_stats.py
@@ -15,7 +15,10 @@ def make_stats(score_table: str) -> None:
max_score = 0
with open(score_table) as f:
reader = csv.reader(f)
-        header = next(reader) # skip header
+        try:
+            header = next(reader) # skip header
+        except StopIteration:
+            return
max_score = len(header) - 3
scores = [float(row[-1]) for row in reader]

9 changes: 6 additions & 3 deletions bin/oma2uniprot_local.py
@@ -7,10 +7,13 @@
import sys


-def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None:
+def oma2uniprot_local(ids_path: str, idmap_path: str) -> None:
"""
Map a list of OMA IDs to UniProt IDs using a local ID mapping file.
"""
+    with open(ids_path) as f:
+        oma_ids = f.read().splitlines()

mapping = dict()
with gzip.open(idmap_path, "rt") as f:
for line in f:
@@ -27,9 +30,9 @@ def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None:

def main() -> None:
if len(sys.argv) < 3:
-        raise ValueError("Too few arguments. Usage: oma2uniprot_local.py <idmap_path> <ids>")
+        raise ValueError("Too few arguments. Usage: oma2uniprot_local.py <idmap_path> <ids_path>")

-    oma2uniprot_local(sys.argv[2:], sys.argv[1])
+    oma2uniprot_local(sys.argv[2], sys.argv[1])


if __name__ == "__main__":
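Note: the mapper now reads OMA IDs from a file instead of taking them as command-line arguments. A minimal sketch of the new invocation under that assumption (file names are hypothetical):

    # IDs to map, one per line
    printf 'HUMAN00001\nHUMAN00002\n' > oma_ids.txt
    # usage: oma2uniprot_local.py <idmap_path> <ids_path>
    oma2uniprot_local.py oma-uniprot.txt.gz oma_ids.txt > uniprot_ids.txt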
16 changes: 15 additions & 1 deletion bin/plot_orthologs.R
@@ -33,7 +33,21 @@ customize_theme <- function(font_size, text_color, bg_color) {
theme_dark <- customize_theme(font_size, text_color_darkmode, bg_color)
theme_light <- customize_theme(font_size, text_color_lightmode, bg_color)
# Load the data
-data <- read.csv(args[1], header = TRUE, stringsAsFactors = FALSE)
+fallback_plot <- function() {
+    ggplot() +
+        theme_minimal() +
+        theme(panel.grid = element_blank(), axis.text = element_text(color = "transparent"), legend.position = "none")
+}
+empty_plots <- function(e) {
+    ggsave(paste0(args[2], "_supports_dark.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300)
+    ggsave(paste0(args[2], "_supports_light.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300)
+    ggsave(paste0(args[2], "_venn_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    ggsave(paste0(args[2], "_venn_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    ggsave(paste0(args[2], "_jaccard_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    ggsave(paste0(args[2], "_jaccard_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    quit(save = "no", status = 0)
+}
+data <- tryCatch(read.csv(args[1], header = TRUE, stringsAsFactors = FALSE), error = empty_plots)

# Melt the data keeping ID and score
melted_data <- melt(data, id.vars = c("id", "id_format", "score"), variable.name = "method", value.name = "support") %>%
4 changes: 4 additions & 0 deletions bin/score_hits.py
@@ -62,6 +62,10 @@ def main():

# load data
data = load_data_from_csv(sys.argv[1])

+    if not data:
+        return

prefix = sys.argv[2]
with open(sys.argv[3]) as f:
query = f.read().strip()
2 changes: 1 addition & 1 deletion bin/uniprotize_oma_local.py
@@ -29,7 +29,7 @@ def uniprotize_oma(oma_ids_path: str, ensembl_idmap_path: str, refseq_idmap_path
for line in f:
items = line.split()
if items[0] not in refseq_mapping and "_" not in items[1]:
-                refseq_mapping[items[0]] = items[1]
+                refseq_mapping[items[0]] = items[1].split(";")[0]

refseq_ids_mapped = [refseq_mapping[i] for i in ensembl_ids_unmapped if i in refseq_mapping]
refseq_ids_unmapped = [i for i in ensembl_ids_unmapped if i not in refseq_mapping]
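Note: the second column of the RefSeq ID mapping can pack several IDs together, separated by semicolons; the fix keeps only the first. A minimal illustration with made-up IDs:

    # hypothetical idmap line whose second column holds two IDs
    printf 'SOMEID123\tP12345;Q67890\n' | cut -f2 | cut -d';' -f1
    # prints P12345 -- the same selection as items[1].split(";")[0] above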
5 changes: 5 additions & 0 deletions bin/yml2csv.py
@@ -20,6 +20,11 @@ def main() -> None:
with open(input_file) as f:
data = yaml.safe_load(f)

+    if not data:
+        with open(output_file, "w") as f:
+            print("id,percent_max,percent_privates,goodness", file=f)
+        return

with open(output_file, "w") as f:
print("id,percent_max,percent_privates,goodness", file=f)
print(f"{sample_id},{data['percent_max']},{data['percent_privates']},{data['goodness']}", file=f)
38 changes: 38 additions & 0 deletions conf/test_offline.config
@@ -0,0 +1,38 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/reportho -profile test_offline,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'

// Other parameters
offline_run = true
local_databases = true
oma_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-mini.txt.gz"
oma_uniprot_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-uniprot-mini.txt.gz"
oma_ensembl_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-ensembl-mini.txt.gz"
oma_refseq_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-refseq-mini.txt.gz"
panther_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/AllOrthologs-mini.txt"
eggnog_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/1_members-mini.tsv.gz"
eggnog_idmap_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/latest.Eukaryota-mini.tsv.gz"
min_score = 2
skip_downstream = true
}

20 changes: 20 additions & 0 deletions docs/usage.md
@@ -85,6 +85,26 @@ outdir: './results/'

You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).

+ ### Database snapshots
+
+ If you want to use local database copies for the run, you must provide the required files using the appropriate params. See the parameter documentation for details. Below is a list of the files to provide, as named by the FTP service of the respective databases.
+
+ | Parameter           | File name                 |
+ | ------------------- | ------------------------- |
+ | `oma_path`          | `oma-groups.txt.gz`       |
+ | `oma_uniprot_path`  | `oma-uniprot.txt.gz`      |
+ | `oma_ensembl_path`  | `oma-ensembl.txt.gz`      |
+ | `oma_refseq_path`   | `oma-refseq.txt.gz`       |
+ | `panther_path`      | `AllOrthologs.txt`        |
+ | `eggnog_path`       | `1_members.tsv.gz`        |
+ | `eggnog_idmap_path` | `latest.Eukaryota.tsv.gz` |
+
+ ### Running offline
+
+ With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters for this. To use local databases, set `--local_databases` to `true`, and remember to set `--use_all` to `false` so that the database step runs fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`; note that FASTA input will not work with this option enabled. See `test_offline.config` for the options required for a fully offline run. Keep in mind that these options only affect ortholog finding; the downstream analysis still requires a connection to obtain sequences and structures.
+
+ While those options allow the pipeline to run its main steps offline, the pipeline still requires certain configuration files and container images that are downloaded from the internet. If you wish to run the pipeline on a machine without a connection, you can pre-download the required files with `nf-core download`. See [the nf-core tools documentation](https://nf-co.re/docs/nf-core-tools/pipelines/download) for details.
+
### Updating the pipeline

When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
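For illustration, a fully offline ortholog-finding run combining the flags from the "Running offline" section might look like the following; the local snapshot paths are hypothetical, and conf/test_offline.config is the working reference:

    nextflow run nf-core/reportho \
        -profile docker \
        --input samplesheet.csv \
        --outdir ./results \
        --offline_run true \
        --local_databases true \
        --use_all false \
        --skip_downstream true \
        --oma_path ./db/oma-groups.txt.gz \
        --oma_uniprot_path ./db/oma-uniprot.txt.gz \
        --oma_ensembl_path ./db/oma-ensembl.txt.gz \
        --oma_refseq_path ./db/oma-refseq.txt.gz \
        --panther_path ./db/AllOrthologs.txt \
        --eggnog_path ./db/1_members.tsv.gz \
        --eggnog_idmap_path ./db/latest.Eukaryota.tsv.gz

`--skip_downstream true` is included because the downstream analysis still needs a connection, as noted above.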
12 changes: 8 additions & 4 deletions modules/local/fetch_eggnog_group_local.nf
@@ -15,7 +15,10 @@ process FETCH_EGGNOG_GROUP_LOCAL {
input:
tuple val(meta), path(uniprot_id), path(taxid), path(exact)
path db
-    path idmap
+    path eggnog_idmap
+    path ensembl_idmap
+    path refseq_idmap
+    val offline_run

output:
tuple val(meta), path("*_eggnog_group.csv"), emit: eggnog_group
@@ -27,9 +30,10 @@ process FETCH_EGGNOG_GROUP_LOCAL {
script:
prefix = task.ext.prefix ?: meta.id
"""
-    uniprotid=\$(zcat $idmap | grep \$(cat $uniprot_id) | cut -f2)
-    zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt
-    uniprotize_oma_online.py ${prefix}_eggnog_group_raw.txt > ${prefix}_eggnog_group.txt
+    uniprotid=\$(zcat $eggnog_idmap | grep \$(cat $uniprot_id) | cut -f2 | cut -d',' -f1)
+    zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt || test -f ${prefix}_eggnog_group_raw.txt
+    uniprotize_oma_local.py ${prefix}_eggnog_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_eggnog_group.txt
+    touch ${prefix}_eggnog_group.txt
csv_adorn.py ${prefix}_eggnog_group.txt EggNOG > ${prefix}_eggnog_group.csv

cat <<- END_VERSIONS > versions.yml
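Note on the `|| test -f` guard introduced here and in the other local fetch modules: `grep` exits non-zero when it matches nothing, which would abort the task under the strict shell options Nextflow applies. Because the output redirection creates the file before `grep` runs, checking for the file lets an empty result pass while a genuinely failed command still errors. A standalone sketch with hypothetical names:

    set -e
    # grep returns 1 on no match; hits.txt already exists thanks to the redirection,
    # so the guard turns "no orthologs found" into a success with an empty file
    zcat db.txt.gz | grep "QUERY" > hits.txt || test -f hits.txt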
4 changes: 2 additions & 2 deletions modules/local/fetch_oma_group_local.nf
@@ -30,8 +30,8 @@ process FETCH_OMA_GROUP_LOCAL {
prefix = task.ext.prefix ?: meta.id
"""
omaid=\$(uniprot2oma_local.py $uniprot_idmap $uniprot_id)
-    omagroup=\$(zcat $db | grep \$omaid | head -1 | cut -f3-)
-    oma2uniprot_local.py $uniprot_idmap \$omagroup > ${prefix}_oma_group_raw.txt
+    zcat $db | grep \$omaid | head -1 | cut -f3- > ${prefix}_oma_group_oma.txt || test -f ${prefix}_oma_group_oma.txt
+    oma2uniprot_local.py $uniprot_idmap ${prefix}_oma_group_oma.txt > ${prefix}_oma_group_raw.txt
uniprotize_oma_local.py ${prefix}_oma_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_oma_group.txt
csv_adorn.py ${prefix}_oma_group.txt OMA > ${prefix}_oma_group.csv

2 changes: 1 addition & 1 deletion modules/local/fetch_panther_group_local.nf
@@ -27,7 +27,7 @@ process FETCH_PANTHER_GROUP_LOCAL {
prefix = task.ext.prefix ?: meta.id
"""
id=\$(cat ${uniprot_id})
-    grep \$id AllOrthologs.txt | tr '|' ' ' | tr '\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt
+    grep \$id $panther_db | tr '|' ' ' | tr '\\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt || test -f ${prefix}_panther_group_raw.txt
csv_adorn.py ${prefix}_panther_group_raw.txt PANTHER > ${prefix}_panther_group.csv

cat <<- END_VERSIONS > versions.yml
8 changes: 5 additions & 3 deletions modules/local/filter_hits.nf
@@ -21,11 +21,13 @@ process FILTER_HITS {
task.ext.when == null || task.ext.when

script:
-    prefix = task.ext.prefix ?: meta.id
-    filter = use_centroid ? "cat ${prefix}_centroid.txt" : "cat ${prefix}_minscore_${min_score}.txt"
+    prefix = task.ext.prefix ?: meta.id
+    targetfile = use_centroid ? "${prefix}_centroid.txt" : "${prefix}_minscore_${min_score}.txt"
"""
score_hits.py $score_table $prefix $queryid
-    $filter > ${prefix}_filtered_hits.txt
+    touch $targetfile
+    touch ${prefix}_centroid.txt
+    cat $targetfile > ${prefix}_filtered_hits.txt

cat <<- END_VERSIONS > versions.yml
"${task.process}":
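Note: since score_hits.py now returns early on empty input (see the change above), the files it would normally write may be missing; the `touch` calls guarantee both candidate files exist so the final `cat` cannot fail. A sketch of the guard with hypothetical names:

    # create empty placeholders in case score_hits.py wrote nothing
    touch sample_minscore_2.txt sample_centroid.txt
    cat sample_minscore_2.txt > sample_filtered_hits.txt  # empty when no hits passed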
4 changes: 3 additions & 1 deletion modules/local/write_seqinfo.nf
@@ -9,6 +9,7 @@ process WRITE_SEQINFO {

input:
tuple val(meta), val(uniprot_id)
+    val offline_run

output:
tuple val(meta), path("*_id.txt"), path("*_taxid.txt"), path("*_exact.txt") , emit: seqinfo
@@ -19,10 +20,11 @@ process WRITE_SEQINFO {

script:
prefix = task.ext.prefix ?: meta.id
+    tax_command = offline_run ? "echo 'UNKNOWN' > ${prefix}_taxid.txt" : "fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt"
"""
echo "${uniprot_id}" > ${prefix}_id.txt
echo "true" > ${prefix}_exact.txt
-    fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt
+    $tax_command

cat <<- END_VERSIONS > versions.yml
"${task.process}":
8 changes: 5 additions & 3 deletions nextflow.config
@@ -21,6 +21,7 @@ params {

// Ortholog options
use_all = false
+    offline_run = false
local_databases = false
skip_oma = false
oma_path = null
@@ -199,9 +200,10 @@ profiles {
executor.cpus = 4
executor.memory = 8.GB
}
-    test         { includeConfig 'conf/test.config'         }
-    test_fasta   { includeConfig 'conf/test_fasta.config'   }
-    test_full    { includeConfig 'conf/test_full.config'    }
+    test         { includeConfig 'conf/test.config'         }
+    test_fasta   { includeConfig 'conf/test_fasta.config'   }
+    test_full    { includeConfig 'conf/test_full.config'    }
+    test_offline { includeConfig 'conf/test_offline.config' }
}

// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
7 changes: 7 additions & 0 deletions nextflow_schema.json
@@ -63,6 +63,13 @@
"help_text": "If set to `true`, the pipeline will use local databases for the analysis.",
"fa_icon": "fas fa-database"
},
"offline_run": {
"type": "boolean",
"default": "false",
"description": "Run the pipeline in offline mode. Overrides all online database flags.",
"help_text": "If set to `true`, the pipeline will run in offline mode. `local_databases` must be set separately.",
"fa_icon": "fas fa-database"
},
"skip_oma": {
"type": "boolean",
"default": "false",