Added offline run flag and profile #29

Merged · 16 commits · May 23, 2024
32 changes: 26 additions & 6 deletions .github/workflows/ci.yml
@@ -39,9 +39,6 @@ jobs:
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- name: Run pipeline with test data
-      # TODO nf-core: You can customise CI pipeline run tests as required
-      # For example: adding multiple test runs with different parameters
-      # Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results

@@ -68,8 +65,31 @@ jobs:
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- name: Run pipeline with test data
-      # TODO nf-core: You can customise CI pipeline run tests as required
-      # For example: adding multiple test runs with different parameters
-      # Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_fasta,docker --outdir ./results

+  test_offline:
+    name: Run ortholog fetching with offline databases
+    # Only run on push if this is the nf-core dev branch (merged PRs)
+    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/reportho') }}"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        NXF_VER:
+          - "23.04.0"
+          - "latest-everything"
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
+
+      - name: Install Nextflow
+        uses: nf-core/setup-nextflow@v2
+        with:
+          version: "${{ matrix.NXF_VER }}"
+
+      - name: Disk space cleanup
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+
+      - name: Run pipeline with test data
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_offline,docker --outdir ./results
4 changes: 4 additions & 0 deletions bin/make_hits_table.py
@@ -20,6 +20,10 @@ def main() -> None:
reader = csv.DictReader(f)
data = list(reader)

+    if not data:
+        print("id")
+        return

sample_id = sys.argv[2]

# Get list of databases
3 changes: 3 additions & 0 deletions bin/make_score_table.py
@@ -21,6 +21,9 @@ def main() -> None:
reader = csv.reader(f)
data = list(reader)

+    if not data:
+        return

# Get the header and the data
header = data[0]
data = data[1:]
5 changes: 4 additions & 1 deletion bin/make_stats.py
@@ -15,7 +15,10 @@ def make_stats(score_table: str) -> None:
max_score = 0
with open(score_table) as f:
reader = csv.reader(f)
-        header = next(reader) # skip header
+        try:
+            header = next(reader) # skip header
+        except StopIteration:
+            return
max_score = len(header) - 3
scores = [float(row[-1]) for row in reader]

9 changes: 6 additions & 3 deletions bin/oma2uniprot_local.py
@@ -7,10 +7,13 @@
import sys


-def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None:
+def oma2uniprot_local(ids_path: str, idmap_path: str) -> None:
"""
Map a list of OMA IDs to UniProt IDs using a local ID mapping file.
"""
+    with open(ids_path) as f:
+        oma_ids = f.read().splitlines()

mapping = dict()
with gzip.open(idmap_path, "rt") as f:
for line in f:
@@ -27,9 +30,9 @@ def oma2uniprot_local(oma_ids: list[str], idmap_path: str) -> None:

def main() -> None:
if len(sys.argv) < 3:
-        raise ValueError("Too few arguments. Usage: oma2uniprot_local.py <idmap_path> <ids>")
+        raise ValueError("Too few arguments. Usage: oma2uniprot_local.py <idmap_path> <ids_path>")

-    oma2uniprot_local(sys.argv[2:], sys.argv[1])
+    oma2uniprot_local(sys.argv[2], sys.argv[1])


if __name__ == "__main__":
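Note: the mapper now reads OMA IDs from a file instead of taking them as command-line arguments. A minimal sketch of the new invocation under that assumption (file names are hypothetical):

    # IDs to map, one per line
    printf 'HUMAN00001\nHUMAN00002\n' > oma_ids.txt
    # usage: oma2uniprot_local.py <idmap_path> <ids_path>
    oma2uniprot_local.py oma-uniprot.txt.gz oma_ids.txt > uniprot_ids.txt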
16 changes: 15 additions & 1 deletion bin/plot_orthologs.R
@@ -33,7 +33,21 @@ customize_theme <- function(font_size, text_color, bg_color) {
theme_dark <- customize_theme(font_size, text_color_darkmode, bg_color)
theme_light <- customize_theme(font_size, text_color_lightmode, bg_color)
# Load the data
-data <- read.csv(args[1], header = TRUE, stringsAsFactors = FALSE)
+fallback_plot <- function() {
+    ggplot() +
+        theme_minimal() +
+        theme(panel.grid = element_blank(), axis.text = element_text(color = "transparent"), legend.position = "none")
+}
+empty_plots <- function(e) {
+    ggsave(paste0(args[2], "_supports_dark.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300)
+    ggsave(paste0(args[2], "_supports_light.png"), plot = fallback_plot(), width = 6, height = 10, dpi = 300)
+    ggsave(paste0(args[2], "_venn_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    ggsave(paste0(args[2], "_venn_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    ggsave(paste0(args[2], "_jaccard_dark.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    ggsave(paste0(args[2], "_jaccard_light.png"), plot = fallback_plot(), width = 6, height = 6, dpi = 300)
+    quit(save = "no", status = 0)
+}
+data <- tryCatch(read.csv(args[1], header = TRUE, stringsAsFactors = FALSE), error = empty_plots)

# Melt the data keeping ID and score
melted_data <- melt(data, id.vars = c("id", "id_format", "score"), variable.name = "method", value.name = "support") %>%
4 changes: 4 additions & 0 deletions bin/score_hits.py
@@ -62,6 +62,10 @@ def main():

# load data
data = load_data_from_csv(sys.argv[1])

+    if not data:
+        return

prefix = sys.argv[2]
with open(sys.argv[3]) as f:
query = f.read().strip()
2 changes: 1 addition & 1 deletion bin/uniprotize_oma_local.py
@@ -29,7 +29,7 @@ def uniprotize_oma(oma_ids_path: str, ensembl_idmap_path: str, refseq_idmap_path
for line in f:
items = line.split()
if items[0] not in refseq_mapping and "_" not in items[1]:
-                refseq_mapping[items[0]] = items[1]
+                refseq_mapping[items[0]] = items[1].split(";")[0]

refseq_ids_mapped = [refseq_mapping[i] for i in ensembl_ids_unmapped if i in refseq_mapping]
refseq_ids_unmapped = [i for i in ensembl_ids_unmapped if i not in refseq_mapping]
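Note: the second column of the RefSeq ID mapping can pack several IDs together, separated by semicolons; the fix keeps only the first. A minimal illustration with made-up IDs:

    # hypothetical idmap line whose second column holds two IDs
    printf 'SOMEID123\tP12345;Q67890\n' | cut -f2 | cut -d';' -f1
    # prints P12345 -- the same selection as items[1].split(";")[0] above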
5 changes: 5 additions & 0 deletions bin/yml2csv.py
@@ -20,6 +20,11 @@ def main() -> None:
with open(input_file) as f:
data = yaml.safe_load(f)

+    if not data:
+        with open(output_file, "w") as f:
+            print("id,percent_max,percent_privates,goodness", file=f)
+        return

with open(output_file, "w") as f:
print("id,percent_max,percent_privates,goodness", file=f)
print(f"{sample_id},{data['percent_max']},{data['percent_privates']},{data['goodness']}", file=f)
38 changes: 38 additions & 0 deletions conf/test_offline.config
@@ -0,0 +1,38 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/reportho -profile test_offline,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'

// Other parameters
offline_run = true
local_databases = true
oma_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-mini.txt.gz"
oma_uniprot_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-uniprot-mini.txt.gz"
oma_ensembl_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-ensembl-mini.txt.gz"
oma_refseq_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/oma-refseq-mini.txt.gz"
panther_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/AllOrthologs-mini.txt"
eggnog_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/1_members-mini.tsv.gz"
eggnog_idmap_path = "https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/databases/latest.Eukaryota-mini.tsv.gz"
min_score = 2
skip_downstream = true
}

20 changes: 20 additions & 0 deletions docs/usage.md
@@ -85,6 +85,26 @@ outdir: './results/'

You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).

+ ### Database snapshots
+
+ If you want to use local database copies for the run, you must provide the required files using the appropriate params. See the parameter documentation for details. Below is a list of the files to provide, as named by the FTP service of the respective databases.
+
+ | Parameter           | File name                 |
+ | ------------------- | ------------------------- |
+ | `oma_path`          | `oma-groups.txt.gz`       |
+ | `oma_uniprot_path`  | `oma-uniprot.txt.gz`      |
+ | `oma_ensembl_path`  | `oma-ensembl.txt.gz`      |
+ | `oma_refseq_path`   | `oma-refseq.txt.gz`       |
+ | `panther_path`      | `AllOrthologs.txt`        |
+ | `eggnog_path`       | `1_members.tsv.gz`        |
+ | `eggnog_idmap_path` | `latest.Eukaryota.tsv.gz` |
+
+ ### Running offline
+
+ With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters for this. To use local databases, set `--local_databases` to `true`, and remember to set `--use_all` to `false` so that the database step runs fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`; note that FASTA input will not work with this option enabled. See `test_offline.config` for the options required for a fully offline run. Keep in mind that these options only affect ortholog finding; the downstream analysis still requires a connection to obtain sequences and structures.
+
+ While those options allow the pipeline to run its main steps offline, the pipeline still requires certain configuration files and container images that are downloaded from the internet. If you wish to run the pipeline on a machine without a connection, you can pre-download the required files with `nf-core download`. See [the nf-core tools documentation](https://nf-co.re/docs/nf-core-tools/pipelines/download) for details.
+
### Updating the pipeline

When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
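For illustration, a fully offline ortholog-finding run combining the flags from the "Running offline" section might look like the following; the local snapshot paths are hypothetical, and conf/test_offline.config is the working reference:

    nextflow run nf-core/reportho \
        -profile docker \
        --input samplesheet.csv \
        --outdir ./results \
        --offline_run true \
        --local_databases true \
        --use_all false \
        --skip_downstream true \
        --oma_path ./db/oma-groups.txt.gz \
        --oma_uniprot_path ./db/oma-uniprot.txt.gz \
        --oma_ensembl_path ./db/oma-ensembl.txt.gz \
        --oma_refseq_path ./db/oma-refseq.txt.gz \
        --panther_path ./db/AllOrthologs.txt \
        --eggnog_path ./db/1_members.tsv.gz \
        --eggnog_idmap_path ./db/latest.Eukaryota.tsv.gz

`--skip_downstream true` is included because the downstream analysis still needs a connection, as noted above.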
12 changes: 8 additions & 4 deletions modules/local/fetch_eggnog_group_local.nf
@@ -15,7 +15,10 @@ process FETCH_EGGNOG_GROUP_LOCAL {
input:
tuple val(meta), path(uniprot_id), path(taxid), path(exact)
path db
-    path idmap
+    path eggnog_idmap
+    path ensembl_idmap
+    path refseq_idmap
+    val offline_run

output:
tuple val(meta), path("*_eggnog_group.csv"), emit: eggnog_group
@@ -27,9 +30,10 @@ process FETCH_EGGNOG_GROUP_LOCAL {
script:
prefix = task.ext.prefix ?: meta.id
"""
-    uniprotid=\$(zcat $idmap | grep \$(cat $uniprot_id) | cut -f2)
-    zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt
-    uniprotize_oma_online.py ${prefix}_eggnog_group_raw.txt > ${prefix}_eggnog_group.txt
+    uniprotid=\$(zcat $eggnog_idmap | grep \$(cat $uniprot_id) | cut -f2 | cut -d',' -f1)
+    zcat $db | grep \$uniprotid | cut -f 5 | tr ',' '\\n' | awk -F'.' '{ print \$2 }' > ${prefix}_eggnog_group_raw.txt || test -f ${prefix}_eggnog_group_raw.txt
+    uniprotize_oma_local.py ${prefix}_eggnog_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_eggnog_group.txt
+    touch ${prefix}_eggnog_group.txt
csv_adorn.py ${prefix}_eggnog_group.txt EggNOG > ${prefix}_eggnog_group.csv

cat <<- END_VERSIONS > versions.yml
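Note on the `|| test -f` guard introduced here and in the other local fetch modules: `grep` exits non-zero when it matches nothing, which would abort the task under the strict shell options Nextflow applies. Because the output redirection creates the file before `grep` runs, checking for the file lets an empty result pass while a genuinely failed command still errors. A standalone sketch with hypothetical names:

    set -e
    # grep returns 1 on no match; hits.txt already exists thanks to the redirection,
    # so the guard turns "no orthologs found" into a success with an empty file
    zcat db.txt.gz | grep "QUERY" > hits.txt || test -f hits.txt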
4 changes: 2 additions & 2 deletions modules/local/fetch_oma_group_local.nf
@@ -30,8 +30,8 @@ process FETCH_OMA_GROUP_LOCAL {
prefix = task.ext.prefix ?: meta.id
"""
omaid=\$(uniprot2oma_local.py $uniprot_idmap $uniprot_id)
-    omagroup=\$(zcat $db | grep \$omaid | head -1 | cut -f3-)
-    oma2uniprot_local.py $uniprot_idmap \$omagroup > ${prefix}_oma_group_raw.txt
+    zcat $db | grep \$omaid | head -1 | cut -f3- > ${prefix}_oma_group_oma.txt || test -f ${prefix}_oma_group_oma.txt
+    oma2uniprot_local.py $uniprot_idmap ${prefix}_oma_group_oma.txt > ${prefix}_oma_group_raw.txt
uniprotize_oma_local.py ${prefix}_oma_group_raw.txt $ensembl_idmap $refseq_idmap > ${prefix}_oma_group.txt
csv_adorn.py ${prefix}_oma_group.txt OMA > ${prefix}_oma_group.csv

2 changes: 1 addition & 1 deletion modules/local/fetch_panther_group_local.nf
@@ -27,7 +27,7 @@ process FETCH_PANTHER_GROUP_LOCAL {
prefix = task.ext.prefix ?: meta.id
"""
id=\$(cat ${uniprot_id})
-    grep \$id AllOrthologs.txt | tr '|' ' ' | tr '\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt
+    grep \$id $panther_db | tr '|' ' ' | tr '\\t' ' ' | cut -d' ' -f3,6 | awk -v id="\$id" -F'UniProtKB=' '{ for(i=0;i<=NF;i++) { if(\$i !~ id) s=s ? s OFS \$i : \$i } print s; s="" }' > ${prefix}_panther_group_raw.txt || test -f ${prefix}_panther_group_raw.txt
csv_adorn.py ${prefix}_panther_group_raw.txt PANTHER > ${prefix}_panther_group.csv

cat <<- END_VERSIONS > versions.yml
8 changes: 5 additions & 3 deletions modules/local/filter_hits.nf
@@ -21,11 +21,13 @@ process FILTER_HITS {
task.ext.when == null || task.ext.when

script:
-    prefix = task.ext.prefix ?: meta.id
-    filter = use_centroid ? "cat ${prefix}_centroid.txt" : "cat ${prefix}_minscore_${min_score}.txt"
+    prefix = task.ext.prefix ?: meta.id
+    targetfile = use_centroid ? "${prefix}_centroid.txt" : "${prefix}_minscore_${min_score}.txt"
"""
score_hits.py $score_table $prefix $queryid
-    $filter > ${prefix}_filtered_hits.txt
+    touch $targetfile
+    touch ${prefix}_centroid.txt
+    cat $targetfile > ${prefix}_filtered_hits.txt

cat <<- END_VERSIONS > versions.yml
"${task.process}":
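Note: since score_hits.py now returns early on empty input (see the change above), the files it would normally write may be missing; the `touch` calls guarantee both candidate files exist so the final `cat` cannot fail. A sketch of the guard with hypothetical names:

    # create empty placeholders in case score_hits.py wrote nothing
    touch sample_minscore_2.txt sample_centroid.txt
    cat sample_minscore_2.txt > sample_filtered_hits.txt  # empty when no hits passed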
4 changes: 3 additions & 1 deletion modules/local/write_seqinfo.nf
@@ -9,6 +9,7 @@ process WRITE_SEQINFO {

input:
tuple val(meta), val(uniprot_id)
+    val offline_run

output:
tuple val(meta), path("*_id.txt"), path("*_taxid.txt"), path("*_exact.txt") , emit: seqinfo
@@ -19,10 +20,11 @@ process WRITE_SEQINFO {

script:
prefix = task.ext.prefix ?: meta.id
+    tax_command = offline_run ? "echo 'UNKNOWN' > ${prefix}_taxid.txt" : "fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt"
"""
echo "${uniprot_id}" > ${prefix}_id.txt
echo "true" > ${prefix}_exact.txt
-    fetch_oma_taxid_by_id.py $uniprot_id > ${prefix}_taxid.txt
+    $tax_command

cat <<- END_VERSIONS > versions.yml
"${task.process}":
8 changes: 5 additions & 3 deletions nextflow.config
@@ -21,6 +21,7 @@ params {

// Ortholog options
use_all = false
+    offline_run = false
local_databases = false
skip_oma = false
oma_path = null
@@ -199,9 +200,10 @@ profiles {
executor.cpus = 4
executor.memory = 8.GB
}
-    test         { includeConfig 'conf/test.config'         }
-    test_fasta   { includeConfig 'conf/test_fasta.config'   }
-    test_full    { includeConfig 'conf/test_full.config'    }
+    test         { includeConfig 'conf/test.config'         }
+    test_fasta   { includeConfig 'conf/test_fasta.config'   }
+    test_full    { includeConfig 'conf/test_full.config'    }
+    test_offline { includeConfig 'conf/test_offline.config' }
}

// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
7 changes: 7 additions & 0 deletions nextflow_schema.json
@@ -63,6 +63,13 @@
"help_text": "If set to `true`, the pipeline will use local databases for the analysis.",
"fa_icon": "fas fa-database"
},
"offline_run": {
"type": "boolean",
"default": "false",
"description": "Run the pipeline in offline mode. Overrides all online database flags.",
"help_text": "If set to `true`, the pipeline will run in offline mode. `local_databases` must be set separately.",
"fa_icon": "fas fa-database"
},
"skip_oma": {
"type": "boolean",
"default": "false",