From 66472313890859e256a98de36ca67277acdd16b7 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 15 May 2024 15:49:35 +0200 Subject: [PATCH 01/16] Added offline run flag and profile --- conf/offline.config | 15 +++++++++++++++ modules/local/fetch_eggnog_group_local.nf | 2 +- modules/local/write_seqinfo.nf | 4 +++- nextflow.config | 2 ++ nextflow_schema.json | 7 +++++++ subworkflows/local/get_orthologs.nf | 15 ++++++++++++++- 6 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 conf/offline.config diff --git a/conf/offline.config b/conf/offline.config new file mode 100644 index 0000000..8224ab7 --- /dev/null +++ b/conf/offline.config @@ -0,0 +1,15 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for offline run. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + config_profile_name = 'Offline' + config_profile_description = 'Settings for offline run' + + // Other parameters + offline_run = true + local_databases = true + skip_downstream = true +} diff --git a/modules/local/fetch_eggnog_group_local.nf b/modules/local/fetch_eggnog_group_local.nf index c1786f8..b350cff 100644 --- a/modules/local/fetch_eggnog_group_local.nf +++ b/modules/local/fetch_eggnog_group_local.nf @@ -28,7 +28,7 @@ process FETCH_EGGNOG_GROUP_LOCAL { prefix = task.ext.prefix ?: meta.id """ uniprotid=\$(zcat $idmap | grep \$(cat $uniprot_id) | cut -f2) - zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt + zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt uniprotize_oma_online.py ${prefix}_eggnog_group_raw.txt > ${prefix}_eggnog_group.txt csv_adorn.py ${prefix}_eggnog_group.txt EggNOG > ${prefix}_eggnog_group.csv diff --git a/modules/local/write_seqinfo.nf b/modules/local/write_seqinfo.nf index 66e1a23..31f2aed 100644 --- a/modules/local/write_seqinfo.nf +++ b/modules/local/write_seqinfo.nf @@ -9,6 +9,7 @@ process WRITE_SEQINFO { input: tuple val(meta), val(uniprot_id) + val offline_run output: tuple val(meta), path("*_id.txt"), path("*_taxid.txt"), path("*_exact.txt") , emit: seqinfo @@ -19,10 +20,11 @@ process WRITE_SEQINFO { script: prefix = task.ext.prefix ?: meta.id + tax_command = offline_run ? "echo 'UNKNOWN' > ${prefix}_taxid.txt" : "fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt" """ echo "${uniprot_id}" > ${prefix}_id.txt echo "true" > ${prefix}_exact.txt - fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt + $tax_command cat <<- END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 805568f..f1c47e2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,7 @@ params { // Ortholog options use_all = false + offline_run = false local_databases = false skip_oma = false oma_path = null @@ -202,6 +203,7 @@ profiles { test { includeConfig 'conf/test.config' } test_fasta { includeConfig 'conf/test_fasta.config' } test_full { includeConfig 'conf/test_full.config' } + offline { includeConfig 'conf/offline.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index 4c22f00..82303a4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -63,6 +63,13 @@ "help_text": "If set to `true`, the pipeline will use local databases for the analysis.", "fa_icon": "fas fa-database" }, + "offline_run": { + "type": "boolean", + "default": "false", + "description": "Run the pipeline in offline mode. Overrides all online database flags.", + "help_text": "If set to `true`, the pipeline will run in offline mode. `local_databases` must be set separately.", + "fa_icon": "fas fa-database" + }, "skip_oma": { "type": "boolean", "default": "false", diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index 6634aaf..ea88357 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -28,6 +28,14 @@ workflow GET_ORTHOLOGS { ch_versions = Channel.empty() ch_orthogroups = Channel.empty() + fasta_input = false + ch_samplesheet_fasta.ifEmpty { + fasta_input = true + } + if (fasta_input && params.offline_run) { + error "Offline run is currently not supported with fasta files as input." + } + // Preprocessing - find the ID and taxid of the query sequences ch_samplesheet_fasta .map { it -> [it[0], file(it[1])] } @@ -41,7 +49,8 @@ workflow GET_ORTHOLOGS { ch_versions = ch_versions.mix(IDENTIFY_SEQ_ONLINE.out.versions) WRITE_SEQINFO ( - ch_samplesheet_query + ch_samplesheet_query, + params.offline_run ) ch_query = IDENTIFY_SEQ_ONLINE.out.seqinfo.mix(WRITE_SEQINFO.out.seqinfo) @@ -49,6 +58,10 @@ workflow GET_ORTHOLOGS { // Ortholog fetching + if(params.use_all && params.offline_run) { + warning("Trying to use online databases in offline mode. Are you sure?") // TODO: make a warning + } + if(params.use_all) { // OMA if (params.local_databases) { From a3b5a695e155cb8b7e439df18fdfcb90498bc90f Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 21 May 2024 12:09:01 +0200 Subject: [PATCH 02/16] Added info on offline runs to usage.md --- docs/usage.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 1b1ce30..5614c67 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -85,6 +85,26 @@ outdir: './results/' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +### Database snapshots + +If you want to use local database copies for the run, you must provide the required files using the appropriate params. See the parameter documentation for details. Below you can find a list of files to provide, as named by the FTP service of the respective databases. + +| Parameter | File name | +| ------------------- | ------------------------- | +| `oma_path` | `oma-groups.txt.gz` | +| `oma_uniprot_path` | `oma-uniprot.txt.gz` | +| `oma_ensembl_path` | `oma-ensembl.txt.gz` | +| `oma_refseq_path` | `oma-refseq.txt.gz` | +| `panther_path` | `AllOrthologs.txt` | +| `eggnog_path` | `1_members.tsv.gz` | +| `eggnog_idmap_path` | `latest.Eukaryota.tsv.gz` | + +### Running offline + +With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters used to achieve this. If you want to use local databases, set `--local_databases` to `true`. Remember to set `--use_all` to `false` to ensure the database step is run fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`. Note that FASTA input will not work with this option enabled. For your convenience, there is an `offline` profile provided that sets all the required options for a fully offline run. Keep in mind that the options only affect ortholog finding, and the downstream analysis still requires connection to obtain sequences and structures. + +While those options allow the pipeline to run its steps offline, the pipeline requires certain configuration files and container images that are downloaded from the internet. If you wish to run the pipeline on a machine without a connection, you can pre-download the required files with `nf-core download`. See [the nf-core tools documentation](https://nf-co.re/docs/nf-core-tools/pipelines/download) for details. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: From 8f53b7dc19534c10b4e868ae2a6203527caae3e3 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 21 May 2024 15:23:06 +0200 Subject: [PATCH 03/16] Tweaks related to EggNOG --- modules/local/fetch_eggnog_group_local.nf | 10 +++++++--- subworkflows/local/get_orthologs.nf | 20 ++++++++++++++------ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/modules/local/fetch_eggnog_group_local.nf b/modules/local/fetch_eggnog_group_local.nf index b350cff..443acbd 100644 --- a/modules/local/fetch_eggnog_group_local.nf +++ b/modules/local/fetch_eggnog_group_local.nf @@ -15,7 +15,10 @@ process FETCH_EGGNOG_GROUP_LOCAL { input: tuple val(meta), path(uniprot_id), path(taxid), path(exact) path db - path idmap + path eggnog_idmap + path ensembl_idmap + path refseq_idmap + val offline_run output: tuple val(meta), path("*_eggnog_group.csv"), emit: eggnog_group @@ -27,9 +30,10 @@ process FETCH_EGGNOG_GROUP_LOCAL { script: prefix = task.ext.prefix ?: meta.id """ - uniprotid=\$(zcat $idmap | grep \$(cat $uniprot_id) | cut -f2) + uniprotid=\$(zcat $eggnog_idmap | grep \$(cat $uniprot_id) | cut -f2) zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt - uniprotize_oma_online.py ${prefix}_eggnog_group_raw.txt > ${prefix}_eggnog_group.txt + uniprotize_oma_local.py ${prefix}_eggnog_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_eggnog_group.txt + touch ${prefix}_eggnog_group.txt csv_adorn.py ${prefix}_eggnog_group.txt EggNOG > ${prefix}_eggnog_group.csv cat <<- END_VERSIONS > versions.yml diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index ea88357..a0a1dba 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -28,12 +28,12 @@ workflow GET_ORTHOLOGS { ch_versions = Channel.empty() ch_orthogroups = Channel.empty() - fasta_input = false + fasta_input = true ch_samplesheet_fasta.ifEmpty { - fasta_input = true + fasta_input = false } if (fasta_input && params.offline_run) { - error "Offline run is currently not supported with fasta files as input." + log.warn("You are using FASTA input in an offline run. Online identification will be used. Be aware it might cause rate limit issues.") } // Preprocessing - find the ID and taxid of the query sequences @@ -41,6 +41,8 @@ workflow GET_ORTHOLOGS { .map { it -> [it[0], file(it[1])] } .set { ch_fasta } + ch_fasta.view() + IDENTIFY_SEQ_ONLINE ( ch_fasta ) @@ -59,7 +61,7 @@ workflow GET_ORTHOLOGS { // Ortholog fetching if(params.use_all && params.offline_run) { - warning("Trying to use online databases in offline mode. Are you sure?") // TODO: make a warning + log.warn("Trying to use online databases in offline mode. Are you sure?") } if(params.use_all) { @@ -128,7 +130,10 @@ workflow GET_ORTHOLOGS { FETCH_EGGNOG_GROUP_LOCAL ( ch_query, params.eggnog_path, - params.eggnog_idmap_path + params.eggnog_idmap_path, + params.oma_ensembl_path, + params.oma_refseq_path, + params.offline_run ) ch_orthogroups @@ -173,7 +178,10 @@ workflow GET_ORTHOLOGS { FETCH_EGGNOG_GROUP_LOCAL ( ch_query, params.eggnog_path, - params.eggnog_idmap_path + params.eggnog_idmap_path, + params.oma_ensembl_path, + params.oma_refseq_path, + params.offline_run ) ch_orthogroups From 6920dd37dd08cb37795bf6cb3b32f0ce9c733a0d Mon Sep 17 00:00:00 2001 From: Igor Trujnara <53370556+itrujnara@users.noreply.github.com> Date: Tue, 21 May 2024 15:34:05 +0200 Subject: [PATCH 04/16] Update subworkflows/local/get_orthologs.nf Co-authored-by: Jose Espinosa-Carrasco --- subworkflows/local/get_orthologs.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index a0a1dba..aaf84eb 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -60,7 +60,7 @@ workflow GET_ORTHOLOGS { // Ortholog fetching - if(params.use_all && params.offline_run) { + warning("Both '--use_all' and '--offline_run' parameters have been specified!\nThose databases that can't be run offline will be run online.") log.warn("Trying to use online databases in offline mode. Are you sure?") } From a27bbc610aaf16a2140f725c83bae6d8d5e3d09e Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 21 May 2024 15:35:04 +0200 Subject: [PATCH 05/16] Small fix in use_all warning --- subworkflows/local/get_orthologs.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index aaf84eb..d5ecf30 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -59,8 +59,8 @@ workflow GET_ORTHOLOGS { ch_versions = ch_versions.mix(WRITE_SEQINFO.out.versions) // Ortholog fetching - - warning("Both '--use_all' and '--offline_run' parameters have been specified!\nThose databases that can't be run offline will be run online.") + if(params.offline_run && params.use_all) { + log.warn("Both '--use_all' and '--offline_run' parameters have been specified!\nThose databases that can't be run offline will be run online.") log.warn("Trying to use online databases in offline mode. Are you sure?") } From cb3d7682e2529e996327136c7b835cec6c208d01 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Tue, 21 May 2024 15:35:28 +0200 Subject: [PATCH 06/16] Another tweak for doubled warning --- subworkflows/local/get_orthologs.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index d5ecf30..6caf02a 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -61,7 +61,6 @@ workflow GET_ORTHOLOGS { // Ortholog fetching if(params.offline_run && params.use_all) { log.warn("Both '--use_all' and '--offline_run' parameters have been specified!\nThose databases that can't be run offline will be run online.") - log.warn("Trying to use online databases in offline mode. Are you sure?") } if(params.use_all) { From 5d86656bea25e3eb25df3a06ba369e01c6f8e2f0 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 11:13:56 +0200 Subject: [PATCH 07/16] Noticed something about the refseq ID map --- bin/uniprotize_oma_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/uniprotize_oma_local.py b/bin/uniprotize_oma_local.py index 3e12da9..f628839 100755 --- a/bin/uniprotize_oma_local.py +++ b/bin/uniprotize_oma_local.py @@ -29,7 +29,7 @@ def uniprotize_oma(oma_ids_path: str, ensembl_idmap_path: str, refseq_idmap_path for line in f: items = line.split() if items[0] not in refseq_mapping and "_" not in items[1]: - refseq_mapping[items[0]] = items[1] + refseq_mapping[items[0]] = items[1].split(";")[0] refseq_ids_mapped = [refseq_mapping[i] for i in ensembl_ids_unmapped if i in refseq_mapping] refseq_ids_unmapped = [i for i in ensembl_ids_unmapped if i not in refseq_mapping] From b10c128d3158c7814bdcd8cb068eab892df52d76 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 11:32:02 +0200 Subject: [PATCH 08/16] Tweak for EggNOG ID map format --- modules/local/fetch_eggnog_group_local.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/fetch_eggnog_group_local.nf b/modules/local/fetch_eggnog_group_local.nf index 443acbd..768d6f8 100644 --- a/modules/local/fetch_eggnog_group_local.nf +++ b/modules/local/fetch_eggnog_group_local.nf @@ -30,7 +30,7 @@ process FETCH_EGGNOG_GROUP_LOCAL { script: prefix = task.ext.prefix ?: meta.id """ - uniprotid=\$(zcat $eggnog_idmap | grep \$(cat $uniprot_id) | cut -f2) + uniprotid=\$(zcat $eggnog_idmap | grep \$(cat $uniprot_id) | cut -f2 | cut -d',' -f1) zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt uniprotize_oma_local.py ${prefix}_eggnog_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_eggnog_group.txt touch ${prefix}_eggnog_group.txt From c5b1e6c0a9ca510ad5d3ca7a11d7faf781392f07 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 12:16:39 +0200 Subject: [PATCH 09/16] Bug fixes --- modules/local/fetch_eggnog_group_local.nf | 2 +- modules/local/fetch_panther_group_local.nf | 2 +- subworkflows/local/get_orthologs.nf | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/local/fetch_eggnog_group_local.nf b/modules/local/fetch_eggnog_group_local.nf index 768d6f8..baa6702 100644 --- a/modules/local/fetch_eggnog_group_local.nf +++ b/modules/local/fetch_eggnog_group_local.nf @@ -31,7 +31,7 @@ process FETCH_EGGNOG_GROUP_LOCAL { prefix = task.ext.prefix ?: meta.id """ uniprotid=\$(zcat $eggnog_idmap | grep \$(cat $uniprot_id) | cut -f2 | cut -d',' -f1) - zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt + zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt || test -f ${prefix}_eggnog_group_raw.txt uniprotize_oma_local.py ${prefix}_eggnog_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_eggnog_group.txt touch ${prefix}_eggnog_group.txt csv_adorn.py ${prefix}_eggnog_group.txt EggNOG > ${prefix}_eggnog_group.csv diff --git a/modules/local/fetch_panther_group_local.nf b/modules/local/fetch_panther_group_local.nf index 60d4979..f823666 100644 --- a/modules/local/fetch_panther_group_local.nf +++ b/modules/local/fetch_panther_group_local.nf @@ -27,7 +27,7 @@ process FETCH_PANTHER_GROUP_LOCAL { prefix = task.ext.prefix ?: meta.id """ id=\$(cat ${uniprot_id}) - grep \$id AllOrthologs.txt | tr '|' ' ' | tr '\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt + grep \$id $panther_db | tr '|' ' ' | tr '\\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt || test -f ${prefix}_panther_group_raw.txt csv_adorn.py ${prefix}_panther_group_raw.txt PANTHER > ${prefix}_panther_group.csv cat <<- END_VERSIONS > versions.yml diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index 6caf02a..22c0472 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -32,6 +32,7 @@ workflow GET_ORTHOLOGS { ch_samplesheet_fasta.ifEmpty { fasta_input = false } + ch_samplesheet_fasta.view() if (fasta_input && params.offline_run) { log.warn("You are using FASTA input in an offline run. Online identification will be used. Be aware it might cause rate limit issues.") } From 2cf571b84fb242e1599a144033bf0e56d01dfbc2 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 12:16:50 +0200 Subject: [PATCH 10/16] Added offline test profile --- conf/test_offline.config | 38 ++++++++++++++++++++++++++++++++++++++ nextflow.config | 9 +++++---- 2 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 conf/test_offline.config diff --git a/conf/test_offline.config b/conf/test_offline.config new file mode 100644 index 0000000..ac794e6 --- /dev/null +++ b/conf/test_offline.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/reportho -profile test_offline, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet_single.csv' + + // Other parameters + offline_run = true + local_databases = true + oma_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-mini.txt.gz" + oma_uniprot_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-uniprot-mini.txt.gz" + oma_ensembl_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-ensembl-mini.txt.gz" + oma_refseq_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-refseq-mini.txt.gz" + panther_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/AllOrthologs-mini.txt" + eggnog_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/1_members-mini.tsv.gz" + eggnog_idmap_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/latest.Eukaryota-mini.tsv.gz" + min_score = 2 + skip_downstream = true +} + diff --git a/nextflow.config b/nextflow.config index f1c47e2..e48ed51 100644 --- a/nextflow.config +++ b/nextflow.config @@ -200,10 +200,11 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_fasta { includeConfig 'conf/test_fasta.config' } - test_full { includeConfig 'conf/test_full.config' } - offline { includeConfig 'conf/offline.config' } + test { includeConfig 'conf/test.config' } + test_fasta { includeConfig 'conf/test_fasta.config' } + test_full { includeConfig 'conf/test_full.config' } + offline { includeConfig 'conf/offline.config' } + test_offline { includeConfig 'conf/test_offline.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 40d9cc41f3f05240670f995a4a8cf980c6b99fe8 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 12:38:20 +0200 Subject: [PATCH 11/16] Removed offline.config --- conf/offline.config | 15 --------------- docs/usage.md | 2 +- nextflow.config | 1 - 3 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 conf/offline.config diff --git a/conf/offline.config b/conf/offline.config deleted file mode 100644 index 8224ab7..0000000 --- a/conf/offline.config +++ /dev/null @@ -1,15 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for offline run. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -params { - config_profile_name = 'Offline' - config_profile_description = 'Settings for offline run' - - // Other parameters - offline_run = true - local_databases = true - skip_downstream = true -} diff --git a/docs/usage.md b/docs/usage.md index 5614c67..190cd30 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -101,7 +101,7 @@ If you want to use local database copies for the run, you must provide the requi ### Running offline -With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters used to achieve this. If you want to use local databases, set `--local_databases` to `true`. Remember to set `--use_all` to `false` to ensure the database step is run fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`. Note that FASTA input will not work with this option enabled. For your convenience, there is an `offline` profile provided that sets all the required options for a fully offline run. Keep in mind that the options only affect ortholog finding, and the downstream analysis still requires connection to obtain sequences and structures. +With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters used to achieve this. If you want to use local databases, set `--local_databases` to `true`. Remember to set `--use_all` to `false` to ensure the database step is run fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`. Note that FASTA input will not work with this option enabled. You can check `test_offline.config` to see the required options for a fully offline run. Keep in mind that the options only affect ortholog finding, and the downstream analysis still requires connection to obtain sequences and structures. While those options allow the pipeline to run its steps offline, the pipeline requires certain configuration files and container images that are downloaded from the internet. If you wish to run the pipeline on a machine without a connection, you can pre-download the required files with `nf-core download`. See [the nf-core tools documentation](https://nf-co.re/docs/nf-core-tools/pipelines/download) for details. diff --git a/nextflow.config b/nextflow.config index e48ed51..ba448fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -203,7 +203,6 @@ profiles { test { includeConfig 'conf/test.config' } test_fasta { includeConfig 'conf/test_fasta.config' } test_full { includeConfig 'conf/test_full.config' } - offline { includeConfig 'conf/offline.config' } test_offline { includeConfig 'conf/test_offline.config' } } From e8549fe25fb87a8eb6224b1b25f1339634de147d Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 14:42:12 +0200 Subject: [PATCH 12/16] Fixed empty output issues in OMA local --- bin/oma2uniprot_local.py | 9 ++++++--- modules/local/fetch_oma_group_local.nf | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bin/oma2uniprot_local.py b/bin/oma2uniprot_local.py index 19c605b..5d1bf8b 100755 --- a/bin/oma2uniprot_local.py +++ b/bin/oma2uniprot_local.py @@ -7,10 +7,13 @@ import sys -def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None: +def oma2uniprot_local(ids_path: str, idmap_path: str) -> None: """ Map a list of OMA IDs to UniProt IDs using a local ID mapping file. """ + with open(ids_path) as f: + oma_ids = f.read().splitlines() + mapping = dict() with gzip.open(idmap_path, "rt") as f: for line in f: @@ -27,9 +30,9 @@ def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None: def main() -> None: if len(sys.argv) < 3: - raise ValueError("Too few arguments. Usage: oma2uniprot_local.py ") + raise ValueError("Too few arguments. Usage: oma2uniprot_local.py ") - oma2uniprot_local(sys.argv[2:], sys.argv[1]) + oma2uniprot_local(sys.argv[2], sys.argv[1]) if __name__ == "__main__": diff --git a/modules/local/fetch_oma_group_local.nf b/modules/local/fetch_oma_group_local.nf index 07a813e..6a0f02f 100644 --- a/modules/local/fetch_oma_group_local.nf +++ b/modules/local/fetch_oma_group_local.nf @@ -30,8 +30,8 @@ process FETCH_OMA_GROUP_LOCAL { prefix = task.ext.prefix ?: meta.id """ omaid=\$(uniprot2oma_local.py $uniprot_idmap $uniprot_id) - omagroup=\$(zcat $db | grep \$omaid | head -1 | cut -f3-) - oma2uniprot_local.py $uniprot_idmap \$omagroup > ${prefix}_oma_group_raw.txt + zcat $db | grep \$omaid | head -1 | cut -f3- > ${prefix}_oma_group_oma.txt || test -f ${prefix}_oma_group_oma.txt + oma2uniprot_local.py $uniprot_idmap ${prefix}_oma_group_oma.txt > ${prefix}_oma_group_raw.txt uniprotize_oma_local.py ${prefix}_oma_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_oma_group.txt csv_adorn.py ${prefix}_oma_group.txt OMA > ${prefix}_oma_group.csv From b44190b18c486be53bb8a0e1815a87222b7d9036 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 15:02:47 +0200 Subject: [PATCH 13/16] More tweaks for empty output --- bin/make_score_table.py | 3 +++ modules/local/filter_hits.nf | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/make_score_table.py b/bin/make_score_table.py index ccea2df..abbce80 100755 --- a/bin/make_score_table.py +++ b/bin/make_score_table.py @@ -21,6 +21,9 @@ def main() -> None: reader = csv.reader(f) data = list(reader) + if not data: + return + # Get the header and the data header = data[0] data = data[1:] diff --git a/modules/local/filter_hits.nf b/modules/local/filter_hits.nf index 5f0d78d..f701d08 100644 --- a/modules/local/filter_hits.nf +++ b/modules/local/filter_hits.nf @@ -21,11 +21,12 @@ process FILTER_HITS { task.ext.when == null || task.ext.when script: - prefix = task.ext.prefix ?: meta.id - filter = use_centroid ? "cat ${prefix}_centroid.txt" : "cat ${prefix}_minscore_${min_score}.txt" + prefix = task.ext.prefix ?: meta.id + targetfile = use_centroid ? "${prefix}_centroid.txt" : "${prefix}_minscore_${min_score}.txt" """ score_hits.py $score_table $prefix $queryid - $filter > ${prefix}_filtered_hits.txt + touch $targetfile + cat $targetfile > ${prefix}_filtered_hits.txt cat <<- END_VERSIONS > versions.yml "${task.process}": From d2d747bc711f5e042fed49c539a167e9381bad35 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 15:03:01 +0200 Subject: [PATCH 14/16] Reverted samplesheet in offline test --- conf/test_offline.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_offline.config b/conf/test_offline.config index ac794e6..7833d17 100644 --- a/conf/test_offline.config +++ b/conf/test_offline.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet_single.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv' // Other parameters offline_run = true From a03639cd7513f42e98d97004255bf521eb2bfce6 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 15:03:12 +0200 Subject: [PATCH 15/16] Added offline test to CI --- .github/workflows/ci.yml | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb18a85..32e5eae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,9 +39,6 @@ jobs: uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results @@ -68,8 +65,31 @@ jobs: uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test_fasta,docker --outdir ./results + + test_offline: + name: Run ortholog fetching with offline databases + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/reportho') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "23.04.0" + - "latest-everything" + steps: + - name: Check out pipeline code + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Disk space cleanup + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - name: Run pipeline with test data + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_offline,docker --outdir ./results From 0b22de07731f8bd3d35b26351ad208ef4b8697e4 Mon Sep 17 00:00:00 2001 From: itrujnara Date: Wed, 22 May 2024 15:49:41 +0200 Subject: [PATCH 16/16] More "null safety" tweaks --- bin/make_hits_table.py | 4 ++++ bin/make_stats.py | 5 ++++- bin/plot_orthologs.R | 16 +++++++++++++++- bin/score_hits.py | 4 ++++ bin/yml2csv.py | 5 +++++ modules/local/filter_hits.nf | 1 + 6 files changed, 33 insertions(+), 2 deletions(-) diff --git a/bin/make_hits_table.py b/bin/make_hits_table.py index 116d9af..034c20f 100755 --- a/bin/make_hits_table.py +++ b/bin/make_hits_table.py @@ -20,6 +20,10 @@ def main() -> None: reader = csv.DictReader(f) data = list(reader) + if not data: + print("id") + return + sample_id = sys.argv[2] # Get list of databases diff --git a/bin/make_stats.py b/bin/make_stats.py index 7a0bf26..7287024 100755 --- a/bin/make_stats.py +++ b/bin/make_stats.py @@ -15,7 +15,10 @@ def make_stats(score_table: str) -> None: max_score = 0 with open(score_table) as f: reader = csv.reader(f) - header = next(reader) # skip header + try: + header = next(reader) # skip header + except StopIteration: + return max_score = len(header) - 3 scores = [float(row[-1]) for row in reader] diff --git a/bin/plot_orthologs.R b/bin/plot_orthologs.R index 3adba7c..23c9e30 100755 --- a/bin/plot_orthologs.R +++ b/bin/plot_orthologs.R @@ -33,7 +33,21 @@ customize_theme <- function(font_size, text_color, bg_color) { theme_dark <- customize_theme(font_size, text_color_darkmode, bg_color) theme_light <- customize_theme(font_size, text_color_lightmode, bg_color) # Load the data -data <- read.csv(args[1], header = TRUE, stringsAsFactors = FALSE) +fallback_plot <- function() { + ggplot() + + theme_minimal() + + theme(panel.grid = element_blank(), axis.text = element_text(color = "transparent"), legend.position = "none") +} +empty_plots <- function(e) { + ggsave(paste0(args[2], "_supports_dark.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300) + ggsave(paste0(args[2], "_supports_light.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300) + ggsave(paste0(args[2], "_venn_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + ggsave(paste0(args[2], "_venn_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + ggsave(paste0(args[2], "_jaccard_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + ggsave(paste0(args[2], "_jaccard_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + quit(save = "no", status = 0) +} +data <- tryCatch(read.csv(args[1], header = TRUE, stringsAsFactors = FALSE), error = empty_plots) # Melt the data keeping ID and score melted_data <- melt(data, id.vars = c("id", "id_format", "score"), variable.name = "method", value.name = "support") %>% diff --git a/bin/score_hits.py b/bin/score_hits.py index 7ad39cc..c9a25fd 100755 --- a/bin/score_hits.py +++ b/bin/score_hits.py @@ -62,6 +62,10 @@ def main(): # load data data = load_data_from_csv(sys.argv[1]) + + if not data: + return + prefix = sys.argv[2] with open(sys.argv[3]) as f: query = f.read().strip() diff --git a/bin/yml2csv.py b/bin/yml2csv.py index 27842b8..142ffa8 100755 --- a/bin/yml2csv.py +++ b/bin/yml2csv.py @@ -20,6 +20,11 @@ def main() -> None: with open(input_file) as f: data = yaml.safe_load(f) + if not data: + with open(output_file, "w") as f: + print("id,percent_max,percent_privates,goodness", file=f) + return + with open(output_file, "w") as f: print("id,percent_max,percent_privates,goodness", file=f) print(f"{sample_id},{data['percent_max']},{data['percent_privates']},{data['goodness']}", file=f) diff --git a/modules/local/filter_hits.nf b/modules/local/filter_hits.nf index f701d08..ea1336f 100644 --- a/modules/local/filter_hits.nf +++ b/modules/local/filter_hits.nf @@ -26,6 +26,7 @@ process FILTER_HITS { """ score_hits.py $score_table $prefix $queryid touch $targetfile + touch ${prefix}_centroid.txt cat $targetfile > ${prefix}_filtered_hits.txt cat <<- END_VERSIONS > versions.yml