diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb18a85..32e5eae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,9 +39,6 @@ jobs: uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results @@ -68,8 +65,31 @@ jobs: uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test_fasta,docker --outdir ./results + + test_offline: + name: Run ortholog fetching with offline databases + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/reportho') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "23.04.0" + - "latest-everything" + steps: + - name: Check out pipeline code + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Disk space cleanup + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - name: Run pipeline with test data + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_offline,docker --outdir ./results diff --git a/bin/make_hits_table.py b/bin/make_hits_table.py index 116d9af..034c20f 100755 --- a/bin/make_hits_table.py +++ b/bin/make_hits_table.py @@ 
-20,6 +20,10 @@ def main() -> None: reader = csv.DictReader(f) data = list(reader) + if not data: + print("id") + return + sample_id = sys.argv[2] # Get list of databases diff --git a/bin/make_score_table.py b/bin/make_score_table.py index ccea2df..abbce80 100755 --- a/bin/make_score_table.py +++ b/bin/make_score_table.py @@ -21,6 +21,9 @@ def main() -> None: reader = csv.reader(f) data = list(reader) + if not data: + return + # Get the header and the data header = data[0] data = data[1:] diff --git a/bin/make_stats.py b/bin/make_stats.py index 7a0bf26..7287024 100755 --- a/bin/make_stats.py +++ b/bin/make_stats.py @@ -15,7 +15,10 @@ def make_stats(score_table: str) -> None: max_score = 0 with open(score_table) as f: reader = csv.reader(f) - header = next(reader) # skip header + try: + header = next(reader) # skip header + except StopIteration: + return max_score = len(header) - 3 scores = [float(row[-1]) for row in reader] diff --git a/bin/oma2uniprot_local.py b/bin/oma2uniprot_local.py index 19c605b..5d1bf8b 100755 --- a/bin/oma2uniprot_local.py +++ b/bin/oma2uniprot_local.py @@ -7,10 +7,13 @@ import sys -def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None: +def oma2uniprot_local(ids_path: str, idmap_path: str) -> None: """ Map a list of OMA IDs to UniProt IDs using a local ID mapping file. """ + with open(ids_path) as f: + oma_ids = f.read().splitlines() + mapping = dict() with gzip.open(idmap_path, "rt") as f: for line in f: @@ -27,9 +30,9 @@ def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None: def main() -> None: if len(sys.argv) < 3: - raise ValueError("Too few arguments. Usage: oma2uniprot_local.py ") + raise ValueError("Too few arguments. 
Usage: oma2uniprot_local.py ") - oma2uniprot_local(sys.argv[2:], sys.argv[1]) + oma2uniprot_local(sys.argv[2], sys.argv[1]) if __name__ == "__main__": diff --git a/bin/plot_orthologs.R b/bin/plot_orthologs.R index 3adba7c..23c9e30 100755 --- a/bin/plot_orthologs.R +++ b/bin/plot_orthologs.R @@ -33,7 +33,21 @@ customize_theme <- function(font_size, text_color, bg_color) { theme_dark <- customize_theme(font_size, text_color_darkmode, bg_color) theme_light <- customize_theme(font_size, text_color_lightmode, bg_color) # Load the data -data <- read.csv(args[1], header = TRUE, stringsAsFactors = FALSE) +fallback_plot <- function() { + ggplot() + + theme_minimal() + + theme(panel.grid = element_blank(), axis.text = element_text(color = "transparent"), legend.position = "none") +} +empty_plots <- function(e) { + ggsave(paste0(args[2], "_supports_dark.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300) + ggsave(paste0(args[2], "_supports_light.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300) + ggsave(paste0(args[2], "_venn_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + ggsave(paste0(args[2], "_venn_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + ggsave(paste0(args[2], "_jaccard_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + ggsave(paste0(args[2], "_jaccard_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300) + quit(save = "no", status = 0) +} +data <- tryCatch(read.csv(args[1], header = TRUE, stringsAsFactors = FALSE), error = empty_plots) # Melt the data keeping ID and score melted_data <- melt(data, id.vars = c("id", "id_format", "score"), variable.name = "method", value.name = "support") %>% diff --git a/bin/score_hits.py b/bin/score_hits.py index 7ad39cc..c9a25fd 100755 --- a/bin/score_hits.py +++ b/bin/score_hits.py @@ -62,6 +62,10 @@ def main(): # load data data = load_data_from_csv(sys.argv[1]) + + if not data: + return + prefix = sys.argv[2] 
with open(sys.argv[3]) as f: query = f.read().strip() diff --git a/bin/uniprotize_oma_local.py b/bin/uniprotize_oma_local.py index 3e12da9..f628839 100755 --- a/bin/uniprotize_oma_local.py +++ b/bin/uniprotize_oma_local.py @@ -29,7 +29,7 @@ def uniprotize_oma(oma_ids_path: str, ensembl_idmap_path: str, refseq_idmap_path for line in f: items = line.split() if items[0] not in refseq_mapping and "_" not in items[1]: - refseq_mapping[items[0]] = items[1] + refseq_mapping[items[0]] = items[1].split(";")[0] refseq_ids_mapped = [refseq_mapping[i] for i in ensembl_ids_unmapped if i in refseq_mapping] refseq_ids_unmapped = [i for i in ensembl_ids_unmapped if i not in refseq_mapping] diff --git a/bin/yml2csv.py b/bin/yml2csv.py index 27842b8..142ffa8 100755 --- a/bin/yml2csv.py +++ b/bin/yml2csv.py @@ -20,6 +20,11 @@ def main() -> None: with open(input_file) as f: data = yaml.safe_load(f) + if not data: + with open(output_file, "w") as f: + print("id,percent_max,percent_privates,goodness", file=f) + return + with open(output_file, "w") as f: print("id,percent_max,percent_privates,goodness", file=f) print(f"{sample_id},{data['percent_max']},{data['percent_privates']},{data['goodness']}", file=f) diff --git a/conf/test_offline.config b/conf/test_offline.config new file mode 100644 index 0000000..7833d17 --- /dev/null +++ b/conf/test_offline.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/reportho -profile test_offline,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv' + + // Other parameters + offline_run = true + local_databases = true + oma_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-mini.txt.gz" + oma_uniprot_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-uniprot-mini.txt.gz" + oma_ensembl_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-ensembl-mini.txt.gz" + oma_refseq_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-refseq-mini.txt.gz" + panther_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/AllOrthologs-mini.txt" + eggnog_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/1_members-mini.tsv.gz" + eggnog_idmap_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/latest.Eukaryota-mini.tsv.gz" + min_score = 2 + skip_downstream = true +} + diff --git a/docs/usage.md b/docs/usage.md index 1b1ce30..190cd30 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -85,6 +85,26 @@ outdir: './results/' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +### Database snapshots + +If you want to use local database copies for the run, you must provide the required files using the appropriate params.
See the parameter documentation for details. Below you can find a list of files to provide, as named by the FTP service of the respective databases. + +| Parameter | File name | +| ------------------- | ------------------------- | +| `oma_path` | `oma-groups.txt.gz` | +| `oma_uniprot_path` | `oma-uniprot.txt.gz` | +| `oma_ensembl_path` | `oma-ensembl.txt.gz` | +| `oma_refseq_path` | `oma-refseq.txt.gz` | +| `panther_path` | `AllOrthologs.txt` | +| `eggnog_path` | `1_members.tsv.gz` | +| `eggnog_idmap_path` | `latest.Eukaryota.tsv.gz` | + +### Running offline + +With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters used to achieve this. If you want to use local databases, set `--local_databases` to `true`. Remember to set `--use_all` to `false` to ensure the database step is run fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`. Note that FASTA input will not work with this option enabled. You can check `test_offline.config` to see the required options for a fully offline run. Keep in mind that the options only affect ortholog finding, and the downstream analysis still requires connection to obtain sequences and structures. + +While those options allow the pipeline to run its steps offline, the pipeline requires certain configuration files and container images that are downloaded from the internet. If you wish to run the pipeline on a machine without a connection, you can pre-download the required files with `nf-core download`. See [the nf-core tools documentation](https://nf-co.re/docs/nf-core-tools/pipelines/download) for details. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. 
When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/modules/local/fetch_eggnog_group_local.nf b/modules/local/fetch_eggnog_group_local.nf index c1786f8..baa6702 100644 --- a/modules/local/fetch_eggnog_group_local.nf +++ b/modules/local/fetch_eggnog_group_local.nf @@ -15,7 +15,10 @@ process FETCH_EGGNOG_GROUP_LOCAL { input: tuple val(meta), path(uniprot_id), path(taxid), path(exact) path db - path idmap + path eggnog_idmap + path ensembl_idmap + path refseq_idmap + val offline_run output: tuple val(meta), path("*_eggnog_group.csv"), emit: eggnog_group @@ -27,9 +30,10 @@ process FETCH_EGGNOG_GROUP_LOCAL { script: prefix = task.ext.prefix ?: meta.id """ - uniprotid=\$(zcat $idmap | grep \$(cat $uniprot_id) | cut -f2) - zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt - uniprotize_oma_online.py ${prefix}_eggnog_group_raw.txt > ${prefix}_eggnog_group.txt + uniprotid=\$(zcat $eggnog_idmap | grep \$(cat $uniprot_id) | cut -f2 | cut -d',' -f1) + zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' 
'{ print \$2 }' > ${prefix}_eggnog_group_raw.txt || test -f ${prefix}_eggnog_group_raw.txt + uniprotize_oma_local.py ${prefix}_eggnog_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_eggnog_group.txt + touch ${prefix}_eggnog_group.txt csv_adorn.py ${prefix}_eggnog_group.txt EggNOG > ${prefix}_eggnog_group.csv cat <<- END_VERSIONS > versions.yml diff --git a/modules/local/fetch_oma_group_local.nf b/modules/local/fetch_oma_group_local.nf index 07a813e..6a0f02f 100644 --- a/modules/local/fetch_oma_group_local.nf +++ b/modules/local/fetch_oma_group_local.nf @@ -30,8 +30,8 @@ process FETCH_OMA_GROUP_LOCAL { prefix = task.ext.prefix ?: meta.id """ omaid=\$(uniprot2oma_local.py $uniprot_idmap $uniprot_id) - omagroup=\$(zcat $db | grep \$omaid | head -1 | cut -f3-) - oma2uniprot_local.py $uniprot_idmap \$omagroup > ${prefix}_oma_group_raw.txt + zcat $db | grep \$omaid | head -1 | cut -f3- > ${prefix}_oma_group_oma.txt || test -f ${prefix}_oma_group_oma.txt + oma2uniprot_local.py $uniprot_idmap ${prefix}_oma_group_oma.txt > ${prefix}_oma_group_raw.txt uniprotize_oma_local.py ${prefix}_oma_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_oma_group.txt csv_adorn.py ${prefix}_oma_group.txt OMA > ${prefix}_oma_group.csv diff --git a/modules/local/fetch_panther_group_local.nf b/modules/local/fetch_panther_group_local.nf index 60d4979..f823666 100644 --- a/modules/local/fetch_panther_group_local.nf +++ b/modules/local/fetch_panther_group_local.nf @@ -27,7 +27,7 @@ process FETCH_PANTHER_GROUP_LOCAL { prefix = task.ext.prefix ?: meta.id """ id=\$(cat ${uniprot_id}) - grep \$id AllOrthologs.txt | tr '|' ' ' | tr '\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt + grep \$id $panther_db | tr '|' ' ' | tr '\\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? 
s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt || test -f ${prefix}_panther_group_raw.txt csv_adorn.py ${prefix}_panther_group_raw.txt PANTHER > ${prefix}_panther_group.csv cat <<- END_VERSIONS > versions.yml diff --git a/modules/local/filter_hits.nf b/modules/local/filter_hits.nf index 5f0d78d..ea1336f 100644 --- a/modules/local/filter_hits.nf +++ b/modules/local/filter_hits.nf @@ -21,11 +21,13 @@ process FILTER_HITS { task.ext.when == null || task.ext.when script: - prefix = task.ext.prefix ?: meta.id - filter = use_centroid ? "cat ${prefix}_centroid.txt" : "cat ${prefix}_minscore_${min_score}.txt" + prefix = task.ext.prefix ?: meta.id + targetfile = use_centroid ? "${prefix}_centroid.txt" : "${prefix}_minscore_${min_score}.txt" """ score_hits.py $score_table $prefix $queryid - $filter > ${prefix}_filtered_hits.txt + touch $targetfile + touch ${prefix}_centroid.txt + cat $targetfile > ${prefix}_filtered_hits.txt cat <<- END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/write_seqinfo.nf b/modules/local/write_seqinfo.nf index 66e1a23..31f2aed 100644 --- a/modules/local/write_seqinfo.nf +++ b/modules/local/write_seqinfo.nf @@ -9,6 +9,7 @@ process WRITE_SEQINFO { input: tuple val(meta), val(uniprot_id) + val offline_run output: tuple val(meta), path("*_id.txt"), path("*_taxid.txt"), path("*_exact.txt") , emit: seqinfo @@ -19,10 +20,11 @@ process WRITE_SEQINFO { script: prefix = task.ext.prefix ?: meta.id + tax_command = offline_run ? 
"echo 'UNKNOWN' > ${prefix}_taxid.txt" : "fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt" """ echo "${uniprot_id}" > ${prefix}_id.txt echo "true" > ${prefix}_exact.txt - fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt + $tax_command cat <<- END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 805568f..ba448fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,7 @@ params { // Ortholog options use_all = false + offline_run = false local_databases = false skip_oma = false oma_path = null @@ -199,9 +200,10 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_fasta { includeConfig 'conf/test_fasta.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_fasta { includeConfig 'conf/test_fasta.config' } + test_full { includeConfig 'conf/test_full.config' } + test_offline { includeConfig 'conf/test_offline.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index 4c22f00..82303a4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -63,6 +63,13 @@ "help_text": "If set to `true`, the pipeline will use local databases for the analysis.", "fa_icon": "fas fa-database" }, + "offline_run": { + "type": "boolean", + "default": "false", + "description": "Run the pipeline in offline mode. Overrides all online database flags.", + "help_text": "If set to `true`, the pipeline will run in offline mode. 
`local_databases` must be set separately.", + "fa_icon": "fas fa-database" + }, "skip_oma": { "type": "boolean", "default": "false", diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf index 6634aaf..22c0472 100644 --- a/subworkflows/local/get_orthologs.nf +++ b/subworkflows/local/get_orthologs.nf @@ -28,11 +28,22 @@ workflow GET_ORTHOLOGS { ch_versions = Channel.empty() ch_orthogroups = Channel.empty() + fasta_input = true + ch_samplesheet_fasta.ifEmpty { + fasta_input = false + } + ch_samplesheet_fasta.view() + if (fasta_input && params.offline_run) { + log.warn("You are using FASTA input in an offline run. Online identification will be used. Be aware it might cause rate limit issues.") + } + // Preprocessing - find the ID and taxid of the query sequences ch_samplesheet_fasta .map { it -> [it[0], file(it[1])] } .set { ch_fasta } + ch_fasta.view() + IDENTIFY_SEQ_ONLINE ( ch_fasta ) @@ -41,13 +52,17 @@ workflow GET_ORTHOLOGS { ch_versions = ch_versions.mix(IDENTIFY_SEQ_ONLINE.out.versions) WRITE_SEQINFO ( - ch_samplesheet_query + ch_samplesheet_query, + params.offline_run ) ch_query = IDENTIFY_SEQ_ONLINE.out.seqinfo.mix(WRITE_SEQINFO.out.seqinfo) ch_versions = ch_versions.mix(WRITE_SEQINFO.out.versions) // Ortholog fetching + if(params.offline_run && params.use_all) { + log.warn("Both '--use_all' and '--offline_run' parameters have been specified!\nThose databases that can't be run offline will be run online.") + } if(params.use_all) { // OMA @@ -115,7 +130,10 @@ workflow GET_ORTHOLOGS { FETCH_EGGNOG_GROUP_LOCAL ( ch_query, params.eggnog_path, - params.eggnog_idmap_path + params.eggnog_idmap_path, + params.oma_ensembl_path, + params.oma_refseq_path, + params.offline_run ) ch_orthogroups @@ -160,7 +178,10 @@ workflow GET_ORTHOLOGS { FETCH_EGGNOG_GROUP_LOCAL ( ch_query, params.eggnog_path, - params.eggnog_idmap_path + params.eggnog_idmap_path, + params.oma_ensembl_path, + params.oma_refseq_path, + params.offline_run ) ch_orthogroups