diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d9afaf2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+resources
+resources_test
+work
+.nextflow*
+.vscode
+.DS_Store
+output
+trace-*
+.ipynb_checkpoints
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..a7ec0e3
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "common"]
+	path = common
+	url = git@github.com:openproblems-bio/common-resources.git
diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..2a22864
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,101 @@
+# denoising 0.1.0
+
+## BREAKING CHANGES
+
+* Update to viash 0.9.0 RC6.
+
+* Directory structure has been updated.
+
+* Update to viash 0.9.0 (PR #13).
+
+## NEW FUNCTIONALITY
+
+* Add `CHANGELOG.md` (PR #7).
+
+* Update `process_dataset` component to subsample large datasets (PR #14).
+
+* Add the scPRINT method (PR #25).
+
+## MAJOR CHANGES
+
+* Revamp `scripts` directory (PR #13).
+
+* Relocated `process_datasets` to `data_processors/process_datasets` (PR #13).
+
+## MINOR CHANGES
+
+* Remove dtype parameter in `.AnnData()` (PR #6).
+
+* Fix target_sum deprecation warning in `mse` metric (PR #8).
+
+* Update `task_name` variable to denoising in component scripts (PR #9).
+
+* Update docker containers used in components (PR #12).
+
+* Set `numpy<2` for some failing methods (PR #13).
+
+* Small changes to api file names (PR #13).
+
+* Update test_resources path in components (PR #18).
+
+* Update workflows to use core repository dependency (PR #20).
+
+* Update the `common` submodule (PR #24).
+
+* Use the common `checkItemAllowed()` for the method check in the benchmark workflow (PR #24).
+
+* Use the `cxg_immune_cell_atlas` dataset instead of the `cxg_mouse_pancreas_atlas` for testing (PR #24).
+
+* Update `README` (PR #24).
+
+* Add a base method API schema (PR #24).
+
+* Add `dataset_organism` to training input files (PR #24).
+
+## BUG FIXES
+
+* Update the nextflow workflow dependencies (PR #17).
+
+* Fix paths in scripts (PR #18).
+
+* Subsample datasets by batch if batch is defined (PR #22).
+
+## Transfer from openproblems-v2 repository
+
+### NEW FUNCTIONALITY
+
+* `api/file_*`: Created file format specifications for the h5ad files throughout the pipeline.
+
+* `api/comp_*`: Created an api definition for the split, method and metric components.
+
+* `process_dataset`: Added a component for processing common datasets into task-ready dataset objects.
+
+* `resources_test/denoising/pancreas` with `src/tasks/denoising/resources_test_scripts/pancreas.sh`.
+
+* `workflows/run`: Added nf-tower test script. (PR #205)
+
+### V1 MIGRATION
+
+* `control_methods/no_denoising`: Migrated from v1. Extracted from baseline method.
+
+* `control_methods/perfect_denoising`: Migrated from v1. Extracted from baseline method.
+
+* `methods/alra`: Migrated from v1. Changed from Python to R and uses lg_cpm normalised data instead of L1 sqrt.
+
+* `methods/dca`: Migrated and adapted from v1.
+
+* `methods/knn_smoothing`: Migrated and adapted from v1.
+
+* `methods/magic`: Migrated from v1.
+
+* `metrics/mse`: Migrated from v1.
+
+* `metrics/poisson`: Migrated from v1.
+
+### Changes from V1
+
+* AnnData layers are used to store data instead of `obsm`.
+
+* Extended the use of sparse data in methods wherever possible.
+
+* `process_dataset` also removes data from the train and test datasets that is not needed by the methods and metrics.
diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
new file mode 100644
index 0000000..74287af
--- /dev/null
+++ b/INSTRUCTIONS.md
@@ -0,0 +1,73 @@
+# Instructions
+
+This is a guide on what to do after you have created a new task repository from the template. More in-depth information about how to create a new task can be found in the [OpenProblems Documentation](https://openproblems.bio/documentation/create_task/).
+
+## First things first
+
+* Update the `_viash.yaml` file with the correct task information.
+* Update the `src/api/task_info.yaml` file with the information you have provided in the task issue.
+
+## Resources
+
+The OpenProblems team has provided some test resources that can be used to test the task. These resources are stored in the `resources` folder. The `scripts/download_resources.sh` script can be used to download these resources.
+
+If these resources are not sufficient, you can add more resources to the `resources` folder. The `scripts/download_resources.sh` script can be updated to download these resources.
+
+#!/bin/bash
+
+echo "This script is not supposed to be run directly."
+echo "Please run the script step-by-step."
+exit 1
+
+# sync resources
+scripts/download_resources.sh
+
+# create a new component
+method_id="my_metric"
+method_lang="python" # change this to "r" if need be
+
+common/create_component/create_component -- \
+  --language "$method_lang" \
+  --name "$method_id"
+
+# TODO: fill in required fields in src/task/methods/foo/config.vsh.yaml
+# TODO: edit src/task/methods/foo/script.py/R
+
+# test the component
+viash test src/task/methods/$method_id/config.vsh.yaml
+
+# rebuild the container (only if you change something in the docker platform)
+# You can reduce the memory and cpu allotted to jobs in _viash.yaml by modifying .platforms[.type == "nextflow"].config.labels
+viash run src/task/methods/$method_id/config.vsh.yaml -- \
+  ---setup cachedbuild ---verbose
+
+# run the method (using parquet as input)
+viash run src/task/methods/$method_id/config.vsh.yaml -- \
+  --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
+  --id_map "resources/neurips-2023-kaggle/id_map.csv" \
+  --output "output/prediction.parquet"
+
+# run the method (using h5ad as input)
+viash run src/task/methods/$method_id/config.vsh.yaml -- \
+  --de_train_h5ad "resources/neurips-2023-kaggle/2023-09-12_de_by_cell_type_train.h5ad" \
+  --id_map "resources/neurips-2023-kaggle/id_map.csv" \
+  --output "output/prediction.parquet"
+
+# run evaluation metric
+viash run src/task/metrics/mean_rowwise_error/config.vsh.yaml -- \
+  --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
+  --prediction "output/prediction.parquet" \
+  --output "output/score.h5ad"
+
+# print score on kaggle test dataset
+python -c 'import anndata; print(anndata.read_h5ad("output/score.h5ad").uns)'
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..3a85904
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Open Problems in Single-Cell Analysis
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including
without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a519d9f --- /dev/null +++ b/README.md @@ -0,0 +1,289 @@ +# Denoising + + + + +Removing noise in sparse single-cell RNA-sequencing count data + +Repository: +[openproblems-bio/task_denoising](https://github.com/openproblems-bio/task_denoising) + +## Description + +A key challenge in evaluating denoising methods is the general lack of a +ground truth. A recent benchmark study ([Hou et al., +2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) +relied on flow-sorted datasets, mixture control experiments ([Tian et +al., 2019](https://www.nature.com/articles/s41592-019-0425-8)), and +comparisons with bulk RNA-Seq data. Since each of these approaches +suffers from specific limitations, it is difficult to combine these +different approaches into a single quantitative measure of denoising +accuracy. Here, we instead rely on an approach termed molecular +cross-validation (MCV), which was specifically developed to quantify +denoising accuracy in the absence of a ground truth ([Batson et al., +2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the +observed molecules in a given scRNA-Seq dataset are first partitioned +between a *training* and a *test* dataset. Next, a denoising method is +applied to the training dataset. Finally, denoising accuracy is measured +by comparing the result to the test dataset. The authors show that both +in theory and in practice, the measured denoising accuracy is +representative of the accuracy that would be obtained on a ground truth +dataset. + +## Authors & contributors + +| name | roles | +|:------------------|:-------------------| +| Wesley Lewis | author, maintainer | +| Scott Gigante | author, maintainer | +| Robrecht Cannoodt | author | +| Kai Waldrant | contributor | + +## API + +``` mermaid +flowchart TB + file_common_dataset("Common Dataset") + comp_data_processor[/"Data processor"/] + file_test("Test data") + file_train("Training data") + comp_control_method[/"Control Method"/] + comp_metric[/"Metric"/] + comp_method[/"Method"/] + file_prediction("Denoised data") + file_score("Score") + file_common_dataset---comp_data_processor + comp_data_processor-->file_test + comp_data_processor-->file_train + file_test---comp_control_method + file_test---comp_metric + file_train---comp_control_method + file_train---comp_method + comp_control_method-->file_prediction + comp_metric-->file_score + comp_method-->file_prediction + file_prediction---comp_metric +``` + +## File format: Common Dataset + +A subset of the common dataset. + +Example file: `resources_test/common/cxg_immune_cell_atlas/dataset.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch' + layers: 'counts' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["batch"]` | `string` | (*Optional*) Batch information. | +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | + +
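For orientation, here is a minimal sketch of how to load this file and access the slots listed above. It assumes the test resources have been synced locally to the example path and that the `anndata` package is installed:

```python
import anndata as ad

# path of the example file listed above; adjust if your resources live elsewhere
adata = ad.read_h5ad("resources_test/common/cxg_immune_cell_atlas/dataset.h5ad")

counts = adata.layers["counts"]                                # raw counts matrix
batch = adata.obs["batch"] if "batch" in adata.obs else None   # optional batch labels

print(adata.uns["dataset_id"], adata.uns["dataset_name"], counts.shape)
```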
+ +## Component type: Data processor + +A denoising dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A subset of the common dataset. | +| `--output_train` | `file` | (*Output*) The subset of molecules used for the training dataset. | +| `--output_test` | `file` | (*Output*) The subset of molecules used for the test dataset. | + +
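Conceptually, this component implements the molecular cross-validation split described earlier: every observed molecule in `layers["counts"]` is assigned to either the training or the test output. The snippet below is an illustrative sketch only (the helper name `mcv_split` and the `train_frac` value are invented for this example); the repository's actual `data_processors/process_datasets` component additionally subsamples large datasets and splits by batch.

```python
import numpy as np
import scipy.sparse as sp

def mcv_split(counts, train_frac=0.9, seed=0):
    """Binomially assign each observed molecule to a train or a test count matrix."""
    rng = np.random.default_rng(seed)
    counts = sp.csr_matrix(counts)

    # sample the number of molecules per non-zero entry that go to the training split
    train_data = rng.binomial(counts.data.astype(np.int64), train_frac)

    train = counts.copy()
    train.data = train_data.astype(counts.dtype)
    train.eliminate_zeros()

    test = counts.copy()
    test.data = (counts.data - train_data).astype(counts.dtype)
    test.eliminate_zeros()

    return train, test
```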
+ +## File format: Test data + +The subset of molecules used for the test dataset + +Example file: +`resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad` + +Format: + +
+ + AnnData object + layers: 'counts' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'train_sum' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["train_sum"]` | `integer` | The total number of counts in the training dataset. | + +
+ +## File format: Training data + +The subset of molecules used for the training dataset + +Example file: +`resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad` + +Format: + +
+ + AnnData object + layers: 'counts' + uns: 'dataset_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:--------------------|:----------|:-------------------------------------| +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | + +
+ +## Component type: Control Method + +A control method. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_train` | `file` | The subset of molecules used for the training dataset. | +| `--input_test` | `file` | The subset of molecules used for the test dataset. | +| `--output` | `file` | (*Output*) A denoised dataset as output by a method. | + +
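Control methods receive both the training and the held-out test counts, so that trivial lower and upper bounds (such as the `no_denoising` and `perfect_denoising` controls mentioned in the changelog) can be computed. Below is a rough, stand-alone sketch of one simple version of a "perfect" control that returns the test counts as its prediction; the actual components in this repository are viash components and may differ in detail.

```python
import anndata as ad

input_train = ad.read_h5ad("resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad")
input_test = ad.read_h5ad("resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad")

# a "perfect" control uses the held-out counts as the prediction,
# giving an upper bound on what a real method could achieve
output = ad.AnnData(
    layers={"denoised": input_test.layers["counts"]},
    obs=input_test.obs[[]],
    var=input_test.var[[]],
    uns={
        "dataset_id": input_train.uns["dataset_id"],
        "method_id": "perfect_denoising",
    },
)
output.write_h5ad("denoised.h5ad", compression="gzip")
```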
+ +## Component type: Metric + +A metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_test` | `file` | The subset of molecules used for the test dataset. | +| `--input_prediction` | `file` | A denoised dataset as output by a method. | +| `--output` | `file` | (*Output*) File indicating the score of a metric. | + +
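As a rough illustration of the metric contract, the sketch below reads the test and denoised files, computes a naive mean squared error, and writes a score file with the slots documented further down. The repository's actual `mse` metric normalises the data before comparing, so treat the computation here as illustrative only:

```python
import anndata as ad
import numpy as np
import scipy.sparse as sp

input_test = ad.read_h5ad("resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad")
input_prediction = ad.read_h5ad("resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad")

test = input_test.layers["counts"]
pred = input_prediction.layers["denoised"]
test = test.toarray() if sp.issparse(test) else np.asarray(test)
pred = pred.toarray() if sp.issparse(pred) else np.asarray(pred)

# naive mean squared error on the raw values (the real metric normalises first)
mse = float(np.mean((test.astype(float) - pred.astype(float)) ** 2))

output = ad.AnnData(
    uns={
        "dataset_id": input_test.uns["dataset_id"],
        "method_id": input_prediction.uns["method_id"],
        "metric_ids": ["mse"],
        "metric_values": [mse],
    }
)
output.write_h5ad("score.h5ad", compression="gzip")
```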
+ +## Component type: Method + +A method. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_train` | `file` | The subset of molecules used for the training dataset. | +| `--output` | `file` | (*Output*) A denoised dataset as output by a method. | + +
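A method only sees the training counts and must write a denoised matrix in the format described below. A minimal, hypothetical skeleton (with the actual denoising step left as a placeholder) might look like this:

```python
import anndata as ad

input_train = ad.read_h5ad("resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad")

counts = input_train.layers["counts"]

# placeholder: a real method would estimate denoised expression from `counts`
# here (for example MAGIC, DCA, kNN-smoothing or ALRA, as listed in the changelog)
denoised = counts.copy()

output = ad.AnnData(
    layers={"denoised": denoised},
    obs=input_train.obs[[]],
    var=input_train.var[[]],
    uns={
        "dataset_id": input_train.uns["dataset_id"],
        "method_id": "my_method",
    },
)
output.write_h5ad("denoised.h5ad", compression="gzip")
```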
+ +## File format: Denoised data + +A denoised dataset as output by a method. + +Example file: +`resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad` + +Format: + +
+ + AnnData object + layers: 'denoised' + uns: 'dataset_id', 'method_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---------------------|:----------|:-------------------------------------| +| `layers["denoised"]` | `integer` | denoised data. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +File indicating the score of a metric. + +Example file: +`resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
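To inspect a score file, the parallel `metric_ids`/`metric_values` arrays can be read back with `anndata`, for example:

```python
import anndata as ad
import numpy as np

scores = ad.read_h5ad("resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad")

# metric_ids and metric_values are parallel arrays of equal length
for metric, value in zip(np.atleast_1d(scores.uns["metric_ids"]),
                         np.atleast_1d(scores.uns["metric_values"])):
    print(f"{metric}: {value}")
```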
+ diff --git a/_viash.yaml b/_viash.yaml new file mode 100644 index 0000000..9466d58 --- /dev/null +++ b/_viash.yaml @@ -0,0 +1,77 @@ +name: task_denoising +organization: openproblems-bio +version: 1.0.0 +license: MIT +label: Denoising +keywords: [single-cell, openproblems, benchmark, denoising] +summary: "Removing noise in sparse single-cell RNA-sequencing count data" +description: | + A key challenge in evaluating denoising methods is the general lack of a ground truth. A + recent benchmark study ([Hou et al., + 2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) + relied on flow-sorted datasets, mixture control experiments ([Tian et al., + 2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk + RNA-Seq data. Since each of these approaches suffers from specific limitations, it is + difficult to combine these different approaches into a single quantitative measure of + denoising accuracy. Here, we instead rely on an approach termed molecular + cross-validation (MCV), which was specifically developed to quantify denoising accuracy + in the absence of a ground truth ([Batson et al., + 2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules + in a given scRNA-Seq dataset are first partitioned between a *training* and a *test* + dataset. Next, a denoising method is applied to the training dataset. Finally, denoising + accuracy is measured by comparing the result to the test dataset. The authors show that + both in theory and in practice, the measured denoising accuracy is representative of the + accuracy that would be obtained on a ground truth dataset. +links: + issue_tracker: https://github.com/openproblems-bio/task_denoising/issues + repository: https://github.com/openproblems-bio/task_denoising + docker_registry: ghcr.io +info: + image: thumbnail.svg + motivation: | + Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present + in each cell. As a result, the measurements (UMI counts) observed for each gene and each + cell are associated with generally high levels of technical noise ([Grün et al., + 2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of + estimating the true expression level of each gene in each cell. In the single-cell + literature, this task is also referred to as *imputation*, a term which is typically + used for missing data problems in statistics. Similar to the use of the terms "dropout", + "missing data", and "technical zeros", this terminology can create confusion about the + underlying measurement process ([Sarkar and Stephens, + 2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)). 
+ test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/task_denoising/ + dest: resources_test/task_denoising + - type: s3 + path: s3://openproblems-data/resources_test/common/ + dest: resources_test/common +authors: + - name: "Wesley Lewis" + roles: [author, maintainer] + info: + github: wes-lewis + - name: "Scott Gigante" + roles: [author, maintainer] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: Robrecht Cannoodt + roles: [author] + info: + github: rcannood + orcid: "0000-0003-3641-729X" + - name: Kai Waldrant + roles: [contributor] + info: + github: KaiWaldrant + orcid: "0009-0003-8555-1361" +repositories: + - name: core + type: github + repo: openproblems-bio/core + tag: build/main + path: viash/core +viash_version: 0.9.0 +config_mods: | + .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } diff --git a/common/CHANGELOG.md b/common/CHANGELOG.md new file mode 100644 index 0000000..1d9a978 --- /dev/null +++ b/common/CHANGELOG.md @@ -0,0 +1,10 @@ +# common-resources 0.1.0 + +Initial release of the common resources. Initial components: + +* `src/component_tests`: Directory containing component testing scripts. +* `src/create_component`: Quickly create a component based on the task API files. +* `src/create_task_readme`: Create the task README.md based on the task API files. +* `src/helper_functions`: Directory with general helper functions that are used by the common resources. +* `src/nextflow_helpers`: Directory containing general nextflow files to support the nextflow workflows. +* `src/sync_resources`: Component to sync resources from AWS s3. diff --git a/common/LICENSE b/common/LICENSE new file mode 100644 index 0000000..3a85904 --- /dev/null +++ b/common/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Open Problems in Single-Cell Analysis + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/common/README.md b/common/README.md new file mode 100644 index 0000000..8a9baac --- /dev/null +++ b/common/README.md @@ -0,0 +1,35 @@ +# common_resources +This repo contains common resources that can be used for OpenProblems v2 tasks. 
+
+## Usage
+
+> [!NOTE]
+> The following instructions are not required when using the [task_template](https://github.com/openproblems-bio/task_template) repository to create your task repository.
+
+To use the resources in this repository, you will need to add it as a submodule to the task repository.
+
+You can do this by running the following command:
+
+```bash
+git submodule add git@github.com:openproblems-bio/common_resources.git common
+```
+
+## Update
+
+To update the repository with the latest changes in the submodule, you can run the following command:
+
+```bash
+git submodule update --remote
+```
+
+## Initialize
+
+When you clone a repository with a submodule and the submodule directory is empty, you will need to initialize it by running the following command:
+
+```bash
+git submodule update --init --recursive
+```
+
+## Resources
+
+The above information is also available in [working-with-submodules](https://github.blog/2016-02-01-working-with-submodules/).
diff --git a/common/TASK_INSTRUCTIONS.md b/common/TASK_INSTRUCTIONS.md
new file mode 100644
index 0000000..1a6b461
--- /dev/null
+++ b/common/TASK_INSTRUCTIONS.md
@@ -0,0 +1,111 @@
+# Instructions
+
+This is a guide on what to do after you have created a new task repository from the template. More in-depth information about how to create a new task can be found in the [OpenProblems Documentation](https://openproblems.bio/documentation/create_task/).
+
+## Requirements
+
+A list of required software to start developing a new task can be found in the [OpenProblems Requirements](https://openproblems.bio/documentation/create_task/requirements).
+
+## First things first
+
+### `_viash.yaml`
+
+1. Update the `name` field to the name of the task in snake_case; the name should start with `task_`.
+2. Update the `description` field to a short description of the task.
+3. Add a keyword to the `keywords` field that describes the task.
+4. Update the `task_template` in the `links` fields to the name of the task from step 1.
+5. Update the info fields to the text from the task issue.
+6. Replace `task_template` with the name of the task in `info.name`.
+7. Update the authors of the task.
+
+After performing the steps above you can remove the comments in the `_viash.yaml` file.
+
+### `common` submodule
+
+Check the instructions [here](README.md) for more information.
+
+## Resources
+
+The OpenProblems team has provided some test resources that can be used to test the task. These resources are stored in the `resources_test` folder. The `scripts/download_resources.sh` script can be used to download these resources.
+If these resources are not sufficient, you can add more resources to the `resources_test` folder. The `scripts/download_resources.sh` script can be updated to download these resources. When using new test resources, let the OpenProblems team know so they can be added to the S3 bucket.
+
+```bash
+scripts/download_resources.sh
+```
+
+## Next steps
+
+### API files ([docs](https://openproblems.bio/documentation/create_task/design_api))
+
+Update the API files in the `src/api` folder. These files define the input and output of the methods and metrics.
+
+### Components ([docs](https://openproblems.bio/documentation/create_task/create_components))
+
+To create a component, you can run the respective script in the `scripts` directory. Before running the script, make sure to update the variables `task_name`, `component_name` and `component_lang` and save the file.
For additional components, you will only need to update the `component_name` and `component_lang` variables.
+
+```bash
+scripts/add_a_control_method.sh
+```
+
+```bash
+scripts/add_a_method.sh
+```
+
+```bash
+scripts/add_a_metric.sh
+```
+
+For each type of component there is already a first component created that you can modify.
+
+1. Update the `.info` fields in the `config.vsh.yaml`.
+2. Add any component-specific arguments to the `config.vsh.yaml` file.
+3. Add any additional resources that are required for the component.
+4. Update the docker engine image setup if additional packages are required.
+5. If you know the required memory and/or CPU, you can adjust the nextflow `.directive.labels` field. In addition, if your component requires a GPU you can add the `gpu` label to the field.
+6. Update the `script.py` or `script.R` file with the code for the component.
+
+> [!NOTE]
+> You can remove the comments in the `config.vsh.yaml` file after you have updated the file.
+
+### Testing Components ([docs](https://openproblems.bio/documentation/create_component/run_tests))
+
+You can test a component by running the following command:
+
+```bash
+viash test /path/to/config.vsh.yaml
+```
+
+You can also test all components by running the following command:
+
+```bash
+scripts/test_all_components.sh
+```
+
+It is possible to customise the command in the above script by adding a `-q` argument to only test a subset of components, e.g. `-q methods` to only test the methods.
+
+## Dataset processor ([docs](https://openproblems.bio/documentation/create_task/dataset_processor))
+
+The dataset processor is a script that removes all information from the common dataset that is not needed for your task. The information to keep is defined in the `api/file_common_dataset.yaml` file. From this filtered dataset, several files are created that are used by the methods and metrics. This safeguards against data leakage and makes sure the structure of the data cannot be altered by a method or a metric.
+
+To create the data processor there is no template available. You can follow the guidelines in the documentation; a minimal, hypothetical sketch is also included at the end of this document. Store the processor in the `src/process_dataset` folder.
+
+Be sure to update the `file_common_dataset.yaml` file with the correct information required for the methods/metrics.
+
+> [!IMPORTANT]
+> When using your own datasets, please advise the OpenProblems team on how to add these datasets to the S3 bucket,
+> as the dataset processor should make use of the `common` datasets folder in the `resources` or `resources_test` directory.
+
+To create the resources and test_resources for the task, we will create a nextflow workflow that will process the datasets. This workflow will be created together with the OpenProblems team.
+
+## README
+
+To create the task `README` file, run the following command:
+
+```bash
+scripts/create_readme.sh
+```
+
+## Benchmarking ([docs](https://openproblems.bio/documentation/create_task/create_workflow))
+
+When you are finished creating your components and dataset processor, you can create a workflow to benchmark the components. This workflow will be created together with the OpenProblems team.
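
As a rough illustration of the dataset processor described above, here is a minimal, hypothetical sketch of a viash-style Python script that splits the common dataset's counts into training and test files using the molecular cross-validation idea from the task description. The `train_frac` and `seed` parameters are invented for this example, and the real `data_processors/process_datasets` component in this repository does more (subsampling, batch-aware splitting), so treat this as a starting point only:

```python
import anndata as ad
import numpy as np
import scipy.sparse as sp

## VIASH START
# `par` is normally filled in by viash; these values are placeholders for local testing
par = {
    "input": "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad",
    "output_train": "train.h5ad",
    "output_test": "test.h5ad",
    "train_frac": 0.9,
    "seed": 0,
}
## VIASH END

adata = ad.read_h5ad(par["input"])
counts = sp.csr_matrix(adata.layers["counts"])

# assign every observed molecule to either the train or the test split
rng = np.random.default_rng(par["seed"])
train_data = rng.binomial(counts.data.astype(np.int64), par["train_frac"])

train_counts = counts.copy()
train_counts.data = train_data.astype(counts.dtype)
train_counts.eliminate_zeros()

test_counts = counts.copy()
test_counts.data = (counts.data - train_data).astype(counts.dtype)
test_counts.eliminate_zeros()

# keep the dataset metadata documented in the file format specifications
dataset_uns = {k: adata.uns[k] for k in adata.uns if k.startswith("dataset_")}

train = ad.AnnData(
    layers={"counts": train_counts},
    obs=adata.obs[[]],
    var=adata.var[[]],
    uns={"dataset_id": adata.uns["dataset_id"]},
)
test = ad.AnnData(
    layers={"counts": test_counts},
    obs=adata.obs[[]],
    var=adata.var[[]],
    uns={**dataset_uns, "train_sum": int(train_counts.sum())},
)

train.write_h5ad(par["output_train"], compression="gzip")
test.write_h5ad(par["output_test"], compression="gzip")
```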
diff --git a/common/component_tests/check_config.py b/common/component_tests/check_config.py new file mode 100644 index 0000000..1e6a7ac --- /dev/null +++ b/common/component_tests/check_config.py @@ -0,0 +1,167 @@ +import re +from typing import Dict, List, Union +import openproblems + +## VIASH START +meta = { + "config" : "foo" +} +## VIASH END + +# TODO: check whether arguments have: +# - label +# - summary +# - description (optional) +# - example +# If defined, check whether these are in the right format: +# - info.file_format +# - info.file_format.file_type +# - slots, columns, depending on the file_type + +## CONSTANTS AND HELPER FUNCTIONS +NAME_MAXLEN = 50 +LABEL_MAXLEN = 50 +SUMMARY_MAXLEN = 400 +DESCRIPTION_MAXLEN = 5000 + +TIME_LABELS = ["lowtime", "midtime", "hightime", "veryhightime"] +MEM_LABELS = ["lowmem", "midmem", "highmem", "veryhighmem"] +CPU_LABELS = ["lowcpu", "midcpu", "highcpu", "veryhighcpu"] + +def check_url(url: str) -> bool: + import requests + from urllib3.util.retry import Retry + from requests.adapters import HTTPAdapter + + # configure retry strategy + session = requests.Session() + retry = Retry(connect=3, backoff_factor=0.5) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + + get = session.head(url) + + if get.ok or get.status_code == 429: # 429 rejected, too many requests + return True + else: + return False + +def check_references(references: Dict[str, Union[str, List[str]]]) -> None: + doi = references.get("doi") + bibtex = references.get("bibtex") + + assert doi or bibtex, "One of .references.doi or .references.bibtex should be defined" + + if doi: + if not isinstance(doi, list): + doi = [doi] + for d in doi: + assert re.match(r"^10.\d{4,9}/[-._;()/:A-Za-z0-9]+$", d), f"Invalid DOI format: {doi}" + assert check_url(f"https://doi.org/{d}"), f"DOI '{d}' is not reachable" + + if bibtex: + if not isinstance(bibtex, list): + bibtex = [bibtex] + for b in bibtex: + assert re.match(r"^@.*{.*", b), f"Invalid bibtex format: {b}" + +def check_links(links: Dict[str, Union[str, List[str]]], required = []) -> None: + if not links: + return + + for expected_link in required: + assert expected_link in links, f"Link .links.{expected_link} is not defined" + + for link_type, link in links.items(): + if link_type != "docker_registry": + assert check_url(link), f"Link .links.{link_type} URL '{link}' is not reachable" + +def check_info(this_info: Dict, this_config: Dict, comp_type: str) -> None: + # check label, summary, description, name + metadata_field_lengths = { + "name": NAME_MAXLEN, + "label": LABEL_MAXLEN, + "summary": SUMMARY_MAXLEN, + "description": DESCRIPTION_MAXLEN + } + + for field, max_length in metadata_field_lengths.items(): + value = this_info.get(field) + if comp_type != "metric": + value = this_config.get(field) or value + assert value, f"Metadata field '{field}' is not defined" + assert "FILL IN:" not in value, f"Metadata field '{field}' not filled in" + assert len(value) <= max_length, f"Metadata field '{field}' should not exceed {max_length} characters" + + # check links + links = this_info.get("links") or this_config.get("links") or {} + required_links = [] + if comp_type == "method": + required_links = ["documentation", "repository"] + check_links(links, required_links) + + # check references + references = this_info.get("references") or {} + if comp_type != "metric": + references = this_config.get("references") or references + if comp_type != "control_method" or references: + 
print("Check references fields", flush=True) + check_references(references) + +## UNIT TEST CHECKS +print("Load config data", flush=True) +config = openproblems.project.read_viash_config(meta["config"]) +info = config.get("info", {}) +comp_type = info.get("type") + +print("Check .namespace", flush=True) +assert config.get("namespace"), ".namespace is not defined" + +print("Check .info.type", flush=True) +expected_types = ["method", "control_method", "metric"] +assert comp_type in expected_types, ".info.type should be equal to 'method' or 'control_method'" + +print("Check component metadata", flush=True) +if comp_type == "metric": + metric_infos = info.get("metrics", []) + assert metric_infos, ".info.metrics is not defined" + + for metric_info in metric_infos: + check_info(metric_info, config, comp_type=comp_type) +else: + check_info(info, config, comp_type=comp_type) + +if "preferred_normalization" in info: + print("Checking contents of .info.preferred_normalization", flush=True) + norm_methods = ["log_cpm", "log_cp10k", "counts", "log_scran_pooling", "sqrt_cpm", "sqrt_cp10k", "l1_sqrt"] + assert info["preferred_normalization"] in norm_methods, ".info['preferred_normalization'] not one of '" + "', '".join(norm_methods) + "'." + +if "variants" in info: + print("Checking contents of .info.variants", flush=True) + arg_names = [arg["clean_name"] for arg in config["all_arguments"]] + ["preferred_normalization"] + + for paramset_id, paramset in info["variants"].items(): + if paramset: + for arg_id in paramset: + assert arg_id in arg_names, f"Argument '{arg_id}' in `.info.variants['{paramset_id}']` is not an argument in `.arguments`." + +# Check runners +runners = config.get("runners", []) + +print("Check Nextflow runner", flush=True) +nextflow_runner = next( + (runner for runner in runners if runner["type"] == "nextflow"), + None +) + +assert nextflow_runner, ".runners does not contain a nextflow runner" +assert nextflow_runner.get("directives"), "directives not a field in nextflow runner" +nextflow_labels = nextflow_runner["directives"].get("label") +assert nextflow_labels, "label not a field in nextflow runner directives" + +assert [label for label in nextflow_labels if label in TIME_LABELS], "time label not filled in" +assert [label for label in nextflow_labels if label in MEM_LABELS], "mem label not filled in" +assert [label for label in nextflow_labels if label in CPU_LABELS], "cpu label not filled in" + +print("All checks succeeded!", flush=True) diff --git a/common/component_tests/run_and_check_output.py b/common/component_tests/run_and_check_output.py new file mode 100644 index 0000000..7fc1b16 --- /dev/null +++ b/common/component_tests/run_and_check_output.py @@ -0,0 +1,182 @@ +import anndata as ad +import pandas as pd +import subprocess +from os import path +import re +import openproblems + +## VIASH START +meta = { + "executable": "target/docker/methods/lstm_gru_cnn_ensemble/lstm_gru_cnn_ensemble", + "config": "target/docker/methods/lstm_gru_cnn_ensemble/.config.vsh.yaml", + "resources_dir": "resources" +} +## VIASH END + +# helper functions +def run_component(cmd): + print(f">> Running script as test", flush=True) + out = subprocess.run(cmd) + + assert out.returncode == 0, f"Script exited with an error. 
Return code: {out.returncode}"
+
+def check_input_files(arguments):
+    print(">> Checking whether input files exist", flush=True)
+    for arg in arguments:
+        if arg["type"] == "file" and arg["direction"] == "input" and arg["required"]:
+            assert not arg["must_exist"] or path.exists(arg["value"]), f"Input file '{arg['value']}' does not exist"
+
+def check_output_files(arguments):
+    print(">> Checking whether output files exist", flush=True)
+    for arg in arguments:
+        if arg["type"] == "file" and arg["direction"] == "output" and arg["required"]:
+            assert not arg["must_exist"] or path.exists(arg["value"]), f"Output file '{arg['value']}' does not exist"
+
+    print(">> Reading h5ad files and checking formats", flush=True)
+    for arg in arguments:
+        if arg["type"] != "file" or arg["direction"] != "output":
+            continue
+        check_format(arg)
+
+def check_format(arg):
+    arg_info = arg.get("info") or {}
+    if arg["type"] == "file":
+        arg_format = arg_info.get("format", {})
+        file_type = arg_format.get("type") or arg_info.get("file_type")
+
+        if file_type == "h5ad":
+            print(f"Reading and checking {arg['clean_name']}", flush=True)
+
+            # read the file as an AnnData object and check its slots
+            adata = ad.read_h5ad(arg["value"])
+
+            print(f"  {adata}")
+
+            check_h5ad_slots(adata, arg)
+        elif file_type in ["parquet", "csv", "tsv"]:
+            print(f"Reading and checking {arg['clean_name']}", flush=True)
+
+            if file_type == "csv":
+                df = pd.read_csv(arg["value"])
+            elif file_type == "tsv":
+                df = pd.read_csv(arg["value"], sep="\t")
+            else:
+                df = pd.read_parquet(arg["value"])
+            print(f"  {df}")
+
+            check_df_columns(df, arg)
+
+
+def check_h5ad_slots(adata, arg):
+    """Check whether an AnnData file contains all of the required
+    slots in the corresponding .info.format field.
+    """
+    arg_info = arg.get("info") or {}
+    arg_format = arg_info.get("format") or arg_info.get("slots") or {}
+    for struc_name, items in arg_format.items():
+        # skip the type field
+        if struc_name == "type":
+            continue
+
+        struc_x = getattr(adata, struc_name)
+
+        if struc_name == "X":
+            if items.get("required", True):
+                assert struc_x is not None,\
+                    f"File '{arg['value']}' is missing slot .{struc_name}"
+
+        else:
+            for item in items:
+                if item.get("required", True):
+                    assert item["name"] in struc_x,\
+                        f"File '{arg['value']}' is missing slot .{struc_name}['{item['name']}']"
+
+def check_df_columns(df, arg):
+    """Check whether a DataFrame contains all of the required
+    columns in the corresponding .info.columns field.
+ """ + arg_info = arg.get("info") or {} + arg_format = arg_info.get("format", {}) + arg_columns = arg_format.get("columns") or arg_info.get("columns") or [] + for item in arg_columns: + if item.get("required", True): + assert item['name'] in df.columns,\ + f"File '{arg['value']}' is missing column '{item['name']}'" + +def get_argument_sets(config): + # get resources + arguments = [] + + for arg in config["all_arguments"]: + new_arg = arg.copy() + arg_info = new_arg.get("info") or {} + + # use example to find test resource file + if arg["type"] == "file": + if arg["direction"] == "input": + value = f"{meta['resources_dir']}/{arg['example'][0]}" + else: + example = arg.get("example", ["example"])[0] + ext_res = re.search(r"\.(\w+)$", example) + if ext_res: + value = f"{arg['clean_name']}.{ext_res.group(1)}" + else: + value = f"{arg['clean_name']}" + new_arg["value"] = value + elif "test_default" in arg_info: + new_arg["value"] = arg_info["test_default"] + + arguments.append(new_arg) + + config_info = config.get("info") or {} + if "test_setup" not in config_info: + argument_sets = {"run": arguments} + else: + test_setup = config_info["test_setup"] + argument_sets = {} + for name, test_instance in test_setup.items(): + new_arguments = [] + for arg in arguments: + new_arg = arg.copy() + if arg["clean_name"] in test_instance: + val = test_instance[arg["clean_name"]] + if new_arg["type"] == "file" and new_arg["direction"] == "input": + val = f"{meta['resources_dir']}/{val}" + new_arg["value"] = val + new_arguments.append(new_arg) + argument_sets[name] = new_arguments + + return argument_sets + +def generate_cmd_args(argument_set): + cmd_args = [] + for arg in argument_set: + if "value" in arg: + value = arg["value"] + if arg["multiple"] and isinstance(value, list): + value = arg["multiple_sep"].join(value) + cmd_args.extend([arg["name"], str(value)]) + return cmd_args + +# read viash config +config = openproblems.project.read_viash_config(meta["config"]) + +# get argument sets +argument_sets = get_argument_sets(config) + +# run component for each argument set +for argset_name, argset_args in argument_sets.items(): + print(f">> Running test '{argset_name}'", flush=True) + # construct command + cmd = [ meta["executable"] ] + generate_cmd_args(argset_args) + + # check input files + check_input_files(argset_args) + + # run component + run_component(cmd) + + # check output files + check_output_files(argset_args) + +print("All checks succeeded!", flush=True) \ No newline at end of file diff --git a/common/helper_functions/subset_h5ad_by_format.py b/common/helper_functions/subset_h5ad_by_format.py new file mode 100644 index 0000000..bf3b796 --- /dev/null +++ b/common/helper_functions/subset_h5ad_by_format.py @@ -0,0 +1,68 @@ +"""Helper functions related to subsetting AnnData objects based on the file format +specifications in the .config.vsh.yaml and slot mapping overrides.""" + +# create new anndata objects according to api spec +def subset_h5ad_by_format(adata, config, arg_name, field_rename_dict = {}): + """Create new anndata object according to slot info specifications. + + Arguments: + adata -- An AnnData object to subset (required) + config -- A Viash config object as read by openproblems.project.read_viash_config (required) + arg_name -- The name of the argument in the config file that specifies the output format (required) + field_rename_dict -- A mapping between the slots of the source h5ad and the slots of the destination h5ad. 
+ Example of slot_mapping: + ``` + slot_mapping = { + "layers": { + "counts": par["layer_counts"], + }, + "obs": { + "cell_type": par["obs_cell_type"], + "batch": par["obs_batch"], + } + } + """ + import pandas as pd + import anndata as ad + + assert isinstance(adata, ad.AnnData), "adata must be an AnnData object" + assert isinstance(config, dict), "config must be a dictionary" + + # find argument + arg = next((x for x in config["all_arguments"] if x["clean_name"] == arg_name), None) + assert arg, f"Argument '{arg_name}' not found in config" + + # find file format + file_format = (arg.get("info") or {}).get("format") + assert file_format, f"Argument '{arg_name}' has no .info.format" + + # find file format type + file_format_type = file_format.get("type") + assert file_format_type == "h5ad", "format must be a h5ad type" + + structs = ["layers", "obs", "var", "uns", "obsp", "obsm", "varp", "varm"] + kwargs = {} + + for struct in structs: + struct_format = file_format.get(struct, {}) + struct_rename = field_rename_dict.get(struct, {}) + + # fetch data from adata + data = {} + for field_format in struct_format: + dest_name = field_format["name"] + # where to find the data. if the dest_name is in the rename dict, use the renamed name + # as the source name, otherwise use the dest_name as the source name + src_name = struct_rename.get(dest_name, dest_name) + data[dest_name] = getattr(adata, struct)[src_name] + + if len(data) > 0: + if struct in ['obs', 'var']: + data = pd.concat(data, axis=1) + kwargs[struct] = data + elif struct in ['obs', 'var']: + # if no columns need to be copied, we still need an 'obs' and a 'var' + # to help determine the shape of the adata + kwargs[struct] = getattr(adata, struct).iloc[:,[]] + + return ad.AnnData(**kwargs) diff --git a/common/library.bib b/common/library.bib new file mode 100644 index 0000000..313bfff --- /dev/null +++ b/common/library.bib @@ -0,0 +1,1722 @@ +@misc{10x2018pbmc, + title = {1k PBMCs from a Healthy Donor (v3 chemistry)}, + author = {{10x Genomics}}, + year = {2018}, + url = {https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0} +} + + +@misc{10x2019pbmc, + title = {5k Peripheral Blood Mononuclear Cells (PBMCs) from a Healthy Donor with a Panel of TotalSeq-B Antibodies (v3 chemistry)}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0} +} + + +@article{agostinis2022newwave, + doi = {10.1093/bioinformatics/btac149}, + url = {https://doi.org/10.1093/bioinformatics/btac149}, + year = {2022}, + month = {Mar.}, + publisher = {Oxford University Press ({OUP})}, + volume = {38}, + number = {9}, + pages = {2648--2650}, + author = {Federico Agostinis and Chiara Romualdi and Gabriele Sales and Davide Risso}, + editor = {Yann Ponty}, + title = {NewWave: a scalable R/Bioconductor package for the dimensionality reduction and batch effect removal of single-cell {RNA}-seq data}, + journal = {Bioinformatics} +} + + +@article{agrawal2021mde, + title = {Minimum-Distortion Embedding}, + author = {Akshay Agrawal and Alnur Ali and Stephen Boyd}, + year = {2021}, + journal = {Foundations and Trends{\textregistered} in Machine Learning}, + publisher = {Now Publishers}, + volume = {14}, + number = {3}, + pages = {211--378}, + doi = {10.1561/2200000090}, + url = {https://doi.org/10.1561/2200000090} +} + + +@article{aliee2021autogenes, + 
title = {{AutoGeneS}: Automatic gene selection using multi-objective optimization for {RNA}-seq deconvolution}, + author = {Hananeh Aliee and Fabian J. Theis}, + year = {2021}, + month = {Jul.}, + journal = {Cell Systems}, + publisher = {Elsevier {BV}}, + volume = {12}, + number = {7}, + pages = {706--715.e4}, + doi = {10.1016/j.cels.2021.05.006}, + url = {https://doi.org/10.1016/j.cels.2021.05.006} +} + + +@inproceedings{amelio2015normalized, + doi = {10.1145/2808797.2809344}, + url = {https://doi.org/10.1145/2808797.2809344}, + year = {2015}, + month = {Aug.}, + publisher = {{ACM}}, + author = {Alessia Amelio and Clara Pizzuti}, + title = {Is Normalized Mutual Information a Fair Measure for Comparing Community Detection Methods?}, + booktitle = {Proceedings of the 2015 {IEEE}/{ACM} International Conference on Advances in Social Networks Analysis and Mining 2015} +} + + +@article{andersson2020single, + title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography}, + author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg}, + year = {2020}, + month = {Oct.}, + journal = {Communications Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {3}, + number = {1}, + doi = {10.1038/s42003-020-01247-y}, + url = {https://doi.org/10.1038/s42003-020-01247-y} +} + + +@string{apr = {Apr.}} + + +@string{aug = {Aug.}} + + +@article{batson2019molecular, + title = {Molecular Cross-Validation for Single-Cell RNA-seq}, + author = {Batson, Joshua and Royer, Lo{\"\i}c and Webber, James}, + year = {2019}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/786269}, + url = {https://www.biorxiv.org/content/early/2019/09/30/786269}, + elocation-id = {786269}, + eprint = {https://www.biorxiv.org/content/early/2019/09/30/786269.full.pdf} +} + + +@article{biancalani2021deep, + title = {Deep learning and alignment of spatially resolved single-cell transcriptomes with Tangram}, + author = {Tommaso Biancalani and Gabriele Scalia and Lorenzo Buffoni and Raghav Avasthi and Ziqing Lu and Aman Sanger and Neriman Tokcan and Charles R. Vanderburg and {\AA}sa Segerstolpe and Meng Zhang and Inbal Avraham-Davidi and Sanja Vickovic and Mor Nitzan and Sai Ma and Ayshwarya Subramanian and Michal Lipinski and Jason Buenrostro and Nik Bear Brown and Duccio Fanelli and Xiaowei Zhuang and Evan Z. Macosko and Aviv Regev}, + year = {2021}, + month = {Oct.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {18}, + number = {11}, + pages = {1352--1362}, + doi = {10.1038/s41592-021-01264-7}, + url = {https://doi.org/10.1038/s41592-021-01264-7} +} + + +@article{bland2000odds, + title = {Statistics Notes: The odds ratio}, + author = {J. M. 
Bland}, + year = {2000}, + month = {May}, + journal = {{BMJ}}, + publisher = {{BMJ}}, + volume = {320}, + number = {7247}, + pages = {1468--1468}, + doi = {10.1136/bmj.320.7247.1468}, + url = {https://doi.org/10.1136/bmj.320.7247.1468} +} + + +@article{breiman2001random, + doi = {10.1023/a:1010933404324}, + url = {https://doi.org/10.1023/a:1010933404324}, + year = {2001}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {45}, + number = {1}, + pages = {5--32}, + author = {Leo Breiman}, + journal = {Machine Learning} +} + + +@article{bttner2018test, + title = {A test metric for assessing single-cell {RNA}-seq batch correction}, + author = {Maren B\"{u}ttner and Zhichao Miao and F. Alexander Wolf and Sarah A. Teichmann and Fabian J. Theis}, + year = {2018}, + month = {Dec.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {1}, + pages = {43--49}, + doi = {10.1038/s41592-018-0254-1}, + url = {https://doi.org/10.1038/s41592-018-0254-1} +} + + +@article{cabello2020singlecellsignalr, + title = {{SingleCellSignalR}: inference of intercellular networks from single-cell transcriptomics}, + author = {Simon Cabello-Aguilar and M{\'{e}}lissa Alame and Fabien Kon-Sun-Tack and Caroline Fau and Matthieu Lacroix and Jacques Colinge}, + year = {2020}, + month = {Mar.}, + journal = {Nucleic Acids Research}, + publisher = {Oxford University Press ({OUP})}, + volume = {48}, + number = {10}, + pages = {e55--e55}, + doi = {10.1093/nar/gkaa183}, + url = {https://doi.org/10.1093/nar/gkaa183} +} + + +@article{cable2021robust, + title = {Robust decomposition of cell type mixtures in spatial transcriptomics}, + author = {Dylan M. Cable and Evan Murray and Luli S. Zou and Aleksandrina Goeva and Evan Z. Macosko and Fei Chen and Rafael A. Irizarry}, + year = {2021}, + month = {Feb.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {4}, + pages = {517--526}, + doi = {10.1038/s41587-021-00830-w}, + url = {https://doi.org/10.1038/s41587-021-00830-w} +} + + +@misc{cannoodt2021viashfromscripts, + doi = {10.48550/ARXIV.2110.11494}, + url = {https://arxiv.org/abs/2110.11494}, + author = {Cannoodt, Robrecht and Cannoodt, Hendrik and Van de Kerckhove, Eric and Boschmans, Andy and De Maeyer, Dries and Verbeiren, Toni}, + keywords = {Software Engineering (cs.SE), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Viash: from scripts to pipelines}, + publisher = {arXiv}, + year = {2021}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} +} + + +@article{cao2018joint, + title = {Joint profiling of chromatin accessibility and gene expression in thousands of single cells}, + author = {Junyue Cao and Darren A. Cusanovich and Vijay Ramani and Delasa Aghamirzaie and Hannah A. Pliner and Andrew J. Hill and Riza M. Daza and Jose L. McFaline-Figueroa and Jonathan S. Packer and Lena Christiansen and Frank J. Steemers and Andrew C. Adey and Cole Trapnell and Jay Shendure}, + year = {2018}, + month = {Sept.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {361}, + number = {6409}, + pages = {1380--1385}, + doi = {10.1126/science.aau0730}, + url = {https://doi.org/10.1126/science.aau0730} +} + + +@article{cao2020human, + title = {A human cell atlas of fetal gene expression}, + author = {Junyue Cao and Diana R. O'Day and Hannah A. 
Pliner and Paul D. Kingsley and Mei Deng and Riza M. Daza and Michael A. Zager and Kimberly A. Aldinger and Ronnie Blecher-Gonen and Fan Zhang and Malte Spielmann and James Palis and Dan Doherty and Frank J. Steemers and Ian A. Glass and Cole Trapnell and Jay Shendure}, + year = {2020}, + month = {Nov.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {370}, + number = {6518}, + doi = {10.1126/science.aba7721}, + url = {https://doi.org/10.1126/science.aba7721} +} + + +@article{chai2014root, + doi = {10.5194/gmdd-7-1525-2014}, + url = {https://doi.org/10.5194/gmdd-7-1525-2014}, + year = {2014}, + month = {Feb.}, + publisher = {Copernicus {GmbH}}, + author = {T. Chai and R. R. Draxler}, + title = {Root mean square error ({RMSE}) or mean absolute error ({MAE})?} +} + + +@article{chazarragil2021flexible, + doi = {10.1093/nar/gkab004}, + url = {https://doi.org/10.1093/nar/gkab004}, + year = {2021}, + month = {Feb.}, + publisher = {Oxford University Press ({OUP})}, + volume = {49}, + number = {7}, + pages = {e42--e42}, + author = {Ruben Chazarra-Gil and Stijn van~Dongen and Vladimir~Yu Kiselev and Martin Hemberg}, + title = {Flexible comparison of batch correction methods for single-cell {RNA}-seq using {BatchBench}}, + journal = {Nucleic Acids Research} +} + + +@article{chen2009local, + title = {Local Multidimensional Scaling for Nonlinear Dimension Reduction, Graph Drawing, and Proximity Analysis}, + author = {Lisha Chen and Andreas Buja}, + year = {2009}, + month = {Mar.}, + journal = {Journal of the American Statistical Association}, + publisher = {Informa {UK} Limited}, + volume = {104}, + number = {485}, + pages = {209--219}, + doi = {10.1198/jasa.2009.0111}, + url = {https://doi.org/10.1198/jasa.2009.0111} +} + + +@inproceedings{chen2016xgboost, + title = {{XGBoost}}, + author = {Tianqi Chen and Carlos Guestrin}, + year = {2016}, + month = {Aug.}, + booktitle = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining}, + publisher = {{Acm}}, + doi = {10.1145/2939672.2939785}, + url = {https://doi.org/10.1145/2939672.2939785} +} + + +@article{cichocki2009fast, + title = {Fast Local Algorithms for Large Scale Nonnegative Matrix and Tensor Factorizations}, + author = {Andrzej Cichocki and Anh-Huy Phan}, + year = {2009}, + journal = {{IEICE} Transactions on Fundamentals of Electronics, Communications and Computer Sciences}, + publisher = {Institute of Electronics, Information and Communications Engineers ({IEICE})}, + volume = {E92-a}, + number = {3}, + pages = {708--721}, + doi = {10.1587/transfun.e92.a.708}, + url = {https://doi.org/10.1587/transfun.e92.a.708} +} + + +@article{coifman2006diffusion, + title = {Diffusion maps}, + author = {Ronald R. Coifman and St{\'{e}}phane Lafon}, + year = {2006}, + month = {Jul.}, + journal = {Applied and Computational Harmonic Analysis}, + publisher = {Elsevier {BV}}, + volume = {21}, + number = {1}, + pages = {5--30}, + doi = {10.1016/j.acha.2006.04.006}, + url = {https://doi.org/10.1016/j.acha.2006.04.006} +} + + +@article{cover1967nearest, + title = {Nearest neighbor pattern classification}, + author = {T. Cover and P. 
Hart}, + year = {1967}, + month = {Jan}, + journal = {{IEEE} Transactions on Information Theory}, + publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, + volume = {13}, + number = {1}, + pages = {21--27}, + doi = {10.1109/tit.1967.1053964}, + url = {https://doi.org/10.1109/tit.1967.1053964} +} + + +@inproceedings{davis2006prauc, + title = {The relationship between Precision-Recall and {ROC} curves}, + author = {Jesse Davis and Mark Goadrich}, + year = {2006}, + booktitle = {Proceedings of the 23rd international conference on Machine learning - {ICML} {\textquotesingle}06}, + publisher = {{ACM} Press}, + doi = {10.1145/1143844.1143874}, + url = {https://doi.org/10.1145/1143844.1143874} +} + + +@string{dec = {Dec.}} + +@article{Demetci2020scot, + author = {Pinar Demetci and Rebecca Santorella and Bj{\"o}rn Sandstede and William Stafford Noble and Ritambhara Singh}, + title = {Gromov-Wasserstein optimal transport to align single-cell multi-omics data}, + elocation-id = {2020.04.28.066787}, + year = {2020}, + doi = {10.1101/2020.04.28.066787}, + publisher = {Cold Spring Harbor Laboratory}, + URL = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787}, + eprint = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787.full.pdf}, + journal = {bioRxiv} +} + + +@article{dimitrov2022comparison, + title = {Comparison of methods and resources for cell-cell communication inference from single-cell {RNA}-Seq data}, + author = {Daniel Dimitrov and D{\'{e}}nes T\"{u}rei and Martin Garrido-Rodriguez and Paul L. Burmedi and James S. Nagai and Charlotte Boys and Ricardo O. Ramirez Flores and Hyojin Kim and Bence Szalai and Ivan G. Costa and Alberto Valdeolivas and Aur{\'{e}}lien Dugourd and Julio Saez-Rodriguez}, + year = {2022}, + month = {Jun.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-30755-0}, + url = {https://doi.org/10.1038/s41467-022-30755-0} +} + + +@article{donoho2017yearsdatascience, + doi = {10.1080/10618600.2017.1384734}, + url = {https://doi.org/10.1080/10618600.2017.1384734}, + year = {2017}, + month = {Oct.}, + publisher = {Informa {UK} Limited}, + volume = {26}, + number = {4}, + pages = {745--766}, + author = {David Donoho}, + title = {50 Years of Data Science}, + journal = {Journal of Computational and Graphical Statistics} +} + + +@article{efremova2020cellphonedb, + title = {{CellPhoneDB}: inferring cell{\textendash}cell communication from combined expression of multi-subunit ligand{\textendash}receptor complexes}, + author = {Mirjana Efremova and Miquel Vento-Tormo and Sarah A. 
Teichmann and Roser Vento-Tormo}, + year = {2020}, + month = {Feb.}, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {4}, + pages = {1484--1506}, + doi = {10.1038/s41596-020-0292-x}, + url = {https://doi.org/10.1038/s41596-020-0292-x} +} + + +@article{emmons2016analysis, + title = {Analysis of Network Clustering Algorithms and Cluster Quality Metrics at Scale}, + volume = {11}, + ISSN = {1932-6203}, + url = {http://dx.doi.org/10.1371/journal.pone.0159161}, + doi = {10.1371/journal.pone.0159161}, + number = {7}, + journal = {PLOS ONE}, + publisher = {Public Library of Science (PLoS)}, + author = {Emmons, Scott and Kobourov, Stephen and Gallant, Mike and B\"{o}rner, Katy}, + editor = {Dovrolis, Constantine}, + year = {2016}, + month = jul, + pages = {e0159161} +} + + +@article{eraslan2019single, + title = {Single-cell {RNA}-seq denoising using a deep count autoencoder}, + author = {G\"{o}kcen Eraslan and Lukas M. Simon and Maria Mircea and Nikola S. Mueller and Fabian J. Theis}, + year = {2019}, + month = {Jan}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {10}, + number = {1}, + doi = {10.1038/s41467-018-07931-2}, + url = {https://doi.org/10.1038/s41467-018-07931-2} +} + + +@string{feb = {Feb.}} + + +@article{fix1989discriminatory, + doi = {10.2307/1403797}, + url = {https://doi.org/10.2307/1403797}, + year = {1989}, + month = {Dec.}, + publisher = {{JSTOR}}, + volume = {57}, + number = {3}, + pages = {238}, + author = {Evelyn Fix and J. L. Hodges}, + title = {Discriminatory Analysis. Nonparametric Discrimination: Consistency Properties}, + journal = {International Statistical Review / Revue Internationale de Statistique} +} + + +@article{gower1975generalized, + title = {Generalized procrustes analysis}, + author = {J. C. Gower}, + year = {1975}, + month = {Mar.}, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {1}, + pages = {33--51}, + doi = {10.1007/bf02291478}, + url = {https://doi.org/10.1007/bf02291478} +} + + +@article{grandini2020metrics, + title = {Metrics for Multi-Class Classification: an Overview}, + author = {Grandini, Margherita and Bagli, Enrico and Visani, Giorgio}, + year = {2020}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.2008.05756}, + url = {https://arxiv.org/abs/2008.05756}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + + +@article{granja2021archr, + title = {{ArchR} is a scalable software package for integrative single-cell chromatin accessibility analysis}, + author = {Jeffrey M. Granja and M. Ryan Corces and Sarah E. Pierce and S. Tansu Bagdatli and Hani Choudhry and Howard Y. Chang and William J. 
Greenleaf}, + year = {2021}, + month = {Feb.}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {3}, + pages = {403--411}, + doi = {10.1038/s41588-021-00790-6}, + url = {https://doi.org/10.1038/s41588-021-00790-6} +} + + +@article{grn2014validation, + title = {Validation of noise models for single-cell transcriptomics}, + author = {Dominic Gr\"{u}n and Lennart Kester and Alexander van Oudenaarden}, + year = {2014}, + month = {Apr.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {6}, + pages = {637--640}, + doi = {10.1038/nmeth.2930}, + url = {https://doi.org/10.1038/nmeth.2930} +} + + +@article{haghverdi2018batch, + title = {Batch effects in single-cell {RNA}-sequencing data are corrected by matching mutual nearest neighbors}, + author = {Laleh Haghverdi and Aaron T L Lun and Michael D Morgan and John C Marioni}, + year = {2018}, + month = {Apr.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {36}, + number = {5}, + pages = {421--427}, + doi = {10.1038/nbt.4091}, + url = {https://doi.org/10.1038/nbt.4091} +} + + +@article{hammarlund2018cengen, + title = {The {CeNGEN} Project: The Complete Gene Expression Map of an Entire Nervous System}, + author = {Marc Hammarlund and Oliver Hobert and David M. Miller and Nenad Sestan}, + year = {2018}, + month = {Aug.}, + journal = {Neuron}, + publisher = {Elsevier {BV}}, + volume = {99}, + number = {3}, + pages = {430--433}, + doi = {10.1016/j.neuron.2018.07.042}, + url = {https://doi.org/10.1016/j.neuron.2018.07.042} +} + + +@article{hansen2012removing, + title = {Adjusting batch effects in microarray expression data using empirical Bayes methods}, + author = {W. Evan Johnson and Cheng Li and Ariel Rabinovic}, + year = {2006}, + month = {Apr.}, + journal = {Biostatistics}, + publisher = {Oxford University Press ({OUP})}, + volume = {8}, + number = {1}, + pages = {118--127}, + doi = {10.1093/biostatistics/kxj037}, + url = {https://doi.org/10.1093/biostatistics/kxj037} +} + + +@article{hao2021integrated, + title = {Integrated analysis of multimodal single-cell data}, + author = {Yuhan Hao and Stephanie Hao and Erica Andersen-Nissen and William M. Mauck and Shiwei Zheng and Andrew Butler and Maddie J. Lee and Aaron J. Wilk and Charlotte Darby and Michael Zager and Paul Hoffman and Marlon Stoeckius and Efthymia Papalexi and Eleni P. Mimitou and Jaison Jain and Avi Srivastava and Tim Stuart and Lamar M. Fleming and Bertrand Yeung and Angela J. Rogers and Juliana M. McElrath and Catherine A. Blish and Raphael Gottardo and Peter Smibert and Rahul Satija}, + year = {2021}, + month = {Jun.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {184}, + number = {13}, + pages = {3573--3587.e29}, + doi = {10.1016/j.cell.2021.04.048}, + url = {https://doi.org/10.1016/j.cell.2021.04.048} +} + + +@article{hie2019efficient, + title = {Efficient integration of heterogeneous single-cell transcriptomes using Scanorama}, + author = {Brian Hie and Bryan Bryson and Bonnie Berger}, + year = {2019}, + month = {May}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {6}, + pages = {685--691}, + doi = {10.1038/s41587-019-0113-3}, + url = {https://doi.org/10.1038/s41587-019-0113-3} +} + + +@article{hinton1989connectionist, + title = {Connectionist learning procedures}, + author = {Geoffrey E. 
Hinton}, + year = {1989}, + month = {Sept.}, + journal = {Artificial Intelligence}, + publisher = {Elsevier {BV}}, + volume = {40}, + number = {1-3}, + pages = {185--234}, + doi = {10.1016/0004-3702(89)90049-0}, + url = {https://doi.org/10.1016/0004-3702(89)90049-0} +} + + +@book{hosmer2013applied, + title = {Applied logistic regression}, + author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, + year = {2013}, + publisher = {John Wiley \& Sons}, + volume = {398} +} + + +@article{hou2019scmatch, + title = {{scMatch}: a single-cell gene expression profile annotation tool using reference datasets}, + author = {Rui Hou and Elena Denisenko and Alistair R R Forrest}, + year = {2019}, + month = {Apr.}, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + volume = {35}, + number = {22}, + pages = {4688--4695}, + doi = {10.1093/bioinformatics/btz292}, + url = {https://doi.org/10.1093/bioinformatics/btz292}, + editor = {Janet Kelso} +} + + +@article{hou2020predicting, + title = {Predicting cell-to-cell communication networks using {NATMI}}, + author = {Rui Hou and Elena Denisenko and Huan Ting Ong and Jordan A. Ramilowski and Alistair R. R. Forrest}, + year = {2020}, + month = {Oct.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {1}, + doi = {10.1038/s41467-020-18873-z}, + url = {https://doi.org/10.1038/s41467-020-18873-z} +} + + +@article{hou2020systematic, + title = {A systematic evaluation of single-cell {RNA}-sequencing imputation methods}, + author = {Wenpin Hou and Zhicheng Ji and Hongkai Ji and Stephanie C. Hicks}, + year = {2020}, + month = {Aug.}, + journal = {Genome Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {21}, + number = {1}, + doi = {10.1186/s13059-020-02132-x}, + url = {https://doi.org/10.1186/s13059-020-02132-x} +} + + +@article{hubert1985comparing, + doi = {10.1007/bf01908075}, + url = {https://doi.org/10.1007/bf01908075}, + year = {1985}, + month = {Dec.}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {2}, + number = {1}, + pages = {193--218}, + author = {Lawrence Hubert and Phipps Arabie}, + title = {Comparing partitions}, + journal = {Journal of Classification} +} + + +@string{jan = {Jan}} + + +@string{jul = {Jul.}} + + +@string{jun = {Jun.}} + + +@article{kendall1938new, + doi = {10.1093/biomet/30.1-2.81}, + url = {https://doi.org/10.1093/biomet/30.1-2.81}, + year = {1938}, + month = {Jun.}, + publisher = {Oxford University Press ({OUP})}, + volume = {30}, + number = {1-2}, + pages = {81--93}, + author = {M. G. KENDALL}, + title = {A new measure of rank correlation}, + journal = {Biometrika} +} + + +@article{kiselev2019challenges, + title = {Challenges in unsupervised clustering of single-cell {RNA}-seq data}, + author = {Vladimir Yu Kiselev and Tallulah S. Andrews and Martin Hemberg}, + year = {2019}, + month = {Jan}, + journal = {Nature Reviews Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {20}, + number = {5}, + pages = {273--282}, + doi = {10.1038/s41576-018-0088-9}, + url = {https://doi.org/10.1038/s41576-018-0088-9} +} + + +@article{kleshchevnikov2022cell2location, + title = {Cell2location maps fine-grained cell types in spatial transcriptomics}, + author = {Vitalii Kleshchevnikov and Artem Shmatko and Emma Dann and Alexander Aivazidis and Hamish W. 
King and Tong Li and Rasa Elmentaite and Artem Lomakin and Veronika Kedlian and Adam Gayoso and Mika Sarkin Jain and Jun Sung Park and Lauma Ramona and Elizabeth Tuck and Anna Arutyunyan and Roser Vento-Tormo and Moritz Gerstung and Louisa James and Oliver Stegle and Omer Ali Bayraktar}, + year = {2022}, + month = {Jan}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {5}, + pages = {661--671}, + doi = {10.1038/s41587-021-01139-4}, + url = {https://doi.org/10.1038/s41587-021-01139-4} +} + + +@article{korsunsky2019fast, + title = {Fast, sensitive and accurate integration of single-cell data with Harmony}, + author = {Ilya Korsunsky and Nghia Millard and Jean Fan and Kamil Slowikowski and Fan Zhang and Kevin Wei and Yuriy Baglaenko and Michael Brenner and Po-ru Loh and Soumya Raychaudhuri}, + year = {2019}, + month = {Nov.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {12}, + pages = {1289--1296}, + doi = {10.1038/s41592-019-0619-0}, + url = {https://doi.org/10.1038/s41592-019-0619-0} +} + + +@article{kraemer2018dimred, + title = {{dimRed} and {coRanking} - Unifying Dimensionality Reduction in R}, + author = {Guido Kraemer and Markus Reichstein and Miguel D. Mahecha}, + year = {2018}, + journal = {The R Journal}, + publisher = {The R Foundation}, + volume = {10}, + number = {1}, + pages = {342}, + doi = {10.32614/rj-2018-039}, + url = {https://doi.org/10.32614/rj-2018-039} +} + + +@article{kruskal1964mds, + title = {Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis}, + author = {J. B. Kruskal}, + year = {1964}, + month = {Mar.}, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {29}, + number = {1}, + pages = {1--27}, + doi = {10.1007/bf02289565}, + url = {https://doi.org/10.1007/bf02289565} +} + + +@article{lance2022multimodal, + title = {Multimodal single cell data integration challenge: results and lessons learned}, + author = {Lance, Christopher and Luecken, Malte D. and Burkhardt, Daniel B. and Cannoodt, Robrecht and Rautenstrauch, Pia and Laddach, Anna and Ubingazhibov, Aidyn and Cao, Zhi-Jie and Deng, Kaiwen and Khan, Sumeer and Liu, Qiao and Russkikh, Nikolay and Ryazantsev, Gleb and Ohler, Uwe and , and Pisco, Angela Oliveira and Bloom, Jonathan and Krishnaswamy, Smita and Theis, Fabian J.}, + year = {2022}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2022.04.11.487796}, + url = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796}, + elocation-id = {2022.04.11.487796}, + eprint = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796.full.pdf} +} + + +@article{lance2024predicting, + title = {Predicting cellular profiles across modalities in longitudinal single-cell data: An Open Problems competition}, + author = {...}, + year = {2024}, + journal = {In preparation}, +} + + +@book{lawson1995solving, + title = {Solving Least Squares Problems}, + author = {Charles L. Lawson and Richard J. Hanson}, + year = {1995}, + month = {Jan}, + publisher = {Society for Industrial and Applied Mathematics}, + doi = {10.1137/1.9781611971217}, + url = {https://doi.org/10.1137/1.9781611971217} +} + + +@article{lee2009quality, + title = {Quality assessment of dimensionality reduction: Rank-based criteria}, + author = {John A. 
Lee and Michel Verleysen}, + year = {2009}, + month = {Mar.}, + journal = {Neurocomputing}, + publisher = {Elsevier {BV}}, + volume = {72}, + number = {7-9}, + pages = {1431--1443}, + doi = {10.1016/j.neucom.2008.12.017}, + url = {https://doi.org/10.1016/j.neucom.2008.12.017} +} + + +@article{linderman2018zero, + title = {Zero-preserving imputation of scRNA-seq data using low-rank approximation}, + author = {Linderman, George C. and Zhao, Jun and Kluger, Yuval}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/397588}, + url = {https://www.biorxiv.org/content/early/2018/08/22/397588}, + elocation-id = {397588}, + eprint = {https://www.biorxiv.org/content/early/2018/08/22/397588.full.pdf} +} + + +@article{lopez2018deep, + title = {Deep generative modeling for single-cell transcriptomics}, + author = {Romain Lopez and Jeffrey Regier and Michael B. Cole and Michael I. Jordan and Nir Yosef}, + year = {2018}, + month = {Nov.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {12}, + pages = {1053--1058}, + doi = {10.1038/s41592-018-0229-2}, + url = {https://doi.org/10.1038/s41592-018-0229-2} +} + + +@article{lopez2022destvi, + title = {{DestVI} identifies continuums of cell types in spatial transcriptomics data}, + author = {Romain Lopez and Baoguo Li and Hadas Keren-Shaul and Pierre Boyeau and Merav Kedmi and David Pilzer and Adam Jelinski and Ido Yofe and Eyal David and Allon Wagner and Can Ergen and Yoseph Addadi and Ofra Golani and Franca Ronchese and Michael I. Jordan and Ido Amit and Nir Yosef}, + year = {2022}, + month = {Apr.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {9}, + pages = {1360--1369}, + doi = {10.1038/s41587-022-01272-8}, + url = {https://doi.org/10.1038/s41587-022-01272-8} +} + + +@article{lotfollahi2020query, + title = {Query to reference single-cell integration with transfer learning}, + author = {Lotfollahi, Mohammad and Naghipourfar, Mohsen and Luecken, Malte D. and Khajavi, Matin and B{\"u}ttner, Maren and Avsec, Ziga and Misharin, Alexander V. and Theis, Fabian J.}, + year = {2020}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2020.07.16.205997}, + url = {https://doi.org/10.1101/2020.07.16.205997}, + elocation-id = {2020.07.16.205997}, + eprint = {https://www.biorxiv.org/content/early/2020/07/16/2020.07.16.205997.full.pdf} +} + + +@article{luecken2022benchmarking, + title = {Benchmarking atlas-level data integration in single-cell genomics}, + author = {Malte D. Luecken and M. B\"{u}ttner and K. Chaichoompu and A. Danese and M. Interlandi and M. F. Mueller and D. C. Strobl and L. Zappia and M. Dugas and M. Colom{\'{e}}-Tatch{\'{e}} and Fabian J. Theis}, + year = {2021}, + month = {Dec.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {1}, + pages = {41--50}, + doi = {10.1038/s41592-021-01336-8}, + url = {https://doi.org/10.1038/s41592-021-01336-8} +} + + +@article{lueks2011evaluate, + title = {How to Evaluate Dimensionality Reduction? 
- Improving the Co-ranking Matrix}, + author = {Lueks, Wouter and Mokbel, Bassam and Biehl, Michael and Hammer, Barbara}, + year = {2011}, + journal = {arXiv}, + doi = {10.48550/ARXIV.1110.3917}, + url = {https://arxiv.org/abs/1110.3917}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (cs.LG), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + + +@misc{lun2019fastmnn, + title = {A description of the theory behind the fastMNN algorithm}, + author = {Lun, Aaron}, + year = {2019}, + url = {https://marionilab.github.io/FurtherMNN2018/theory/description.html} +} + + +@string{mar = {Mar.}} + + +@string{may = {May}} + + +@article{mcinnes2018umap, + title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}, + author = {McInnes, Leland and Healy, John and Melville, James}, + year = {2018}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.1802.03426}, + url = {https://arxiv.org/abs/1802.03426}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Computational Geometry (cs.CG), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + + +@article{mereu2020benchmarking, + doi = {10.1038/s41587-020-0469-4}, + author = {Mereu, Elisabetta and Lafzi, Atefeh and Moutinho, Catia and Ziegenhain, Christoph and McCarthy, Davis J and Alvarez-Varela, Adrian and Batlle, Eduard and Sagar and Gruen, Dominic and Lau, Julia K and others}, + journal = {Nature biotechnology}, + number = {6}, + pages = {747--755}, + publisher = {Nature Publishing Group US New York}, + title = {Benchmarking single-cell {RNA}-sequencing protocols for cell atlas projects}, + volume = {38}, + year = {2020} +} + + +@inbook{miles2005rsquared, + title = {Encyclopedia of Statistics in Behavioral Science}, + author = {Jeremy Miles}, + year = {2005}, + month = {Oct.}, + publisher = {John Wiley {\&} Sons, Ltd}, + doi = {10.1002/0470013192.bsa526}, + url = {https://doi.org/10.1002/0470013192.bsa526}, + chapter = {{R-Squared}, Adjusted {R-Squared}} +} + + +@article{moon2019visualizing, + title = {Visualizing structure and transitions in high-dimensional biological data}, + author = {Kevin R. Moon and David van Dijk and Zheng Wang and Scott Gigante and Daniel B. Burkhardt and William S. Chen and Kristina Yim and Antonia van den Elzen and Matthew J. Hirn and Ronald R. Coifman and Natalia B. Ivanova and Guy Wolf and Smita Krishnaswamy}, + year = {2019}, + month = {Dec.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {12}, + pages = {1482--1492}, + doi = {10.1038/s41587-019-0336-3}, + url = {https://doi.org/10.1038/s41587-019-0336-3} +} + + +@article{narayan2021assessing, + title = {Assessing single-cell transcriptomic variability through density-preserving data visualization}, + author = {Ashwin Narayan and Bonnie Berger and Hyunghoon Cho}, + year = {2021}, + month = {Jan}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {39}, + number = {6}, + pages = {765--774}, + doi = {10.1038/s41587-020-00801-7}, + url = {https://doi.org/10.1038/s41587-020-00801-7} +} + + +@article{nestorowa2016single, + title = {A single-cell resolution map of mouse hematopoietic stem and progenitor cell differentiation}, + author = {Sonia Nestorowa and Fiona K. 
Hamey and Blanca Pijuan Sala and Evangelia Diamanti and Mairi Shepherd and Elisa Laurenti and Nicola K. Wilson and David G. Kent and Berthold G\"{o}ttgens}, + year = {2016}, + month = {Aug.}, + journal = {Blood}, + publisher = {American Society of Hematology}, + volume = {128}, + number = {8}, + pages = {e20--e31}, + doi = {10.1182/blood-2016-05-716480}, + url = {https://doi.org/10.1182/blood-2016-05-716480} +} + +@inproceedings{luecken2021neurips, + author = {Luecken, Malte and Burkhardt, Daniel and Cannoodt, Robrecht and Lance, Christopher and Agrawal, Aditi and Aliee, Hananeh and Chen, Ann and Deconinck, Louise and Detweiler, Angela and Granados, Alejandro and Huynh, Shelly and Isacco, Laura and Kim, Yang and Klein, Dominik and DE KUMAR, BONY and Kuppasani, Sunil and Lickert, Heiko and McGeever, Aaron and Melgarejo, Joaquin and Mekonen, Honey and Morri, Maurizio and M\"{u}ller, Michaela and Neff, Norma and Paul, Sheryl and Rieck, Bastian and Schneider, Kaylie and Steelman, Scott and Sterr, Michael and Treacy, Daniel and Tong, Alexander and Villani, Alexandra-Chloe and Wang, Guilin and Yan, Jia and Zhang, Ce and Pisco, Angela and Krishnaswamy, Smita and Theis, Fabian and Bloom, Jonathan M}, + booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks}, + editor = {J. Vanschoren and S. Yeung}, + pages = {}, + publisher = {Curran}, + title = {A sandbox for prediction and integration of DNA, RNA, and proteins in single cells}, + url = {https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/158f3069a435b314a80bdcb024f8e422-Paper-round2.pdf}, + volume = {1}, + year = {2021} +} + + +@string{nov = {Nov.}} + + +@string{oct = {Oct.}} + + +@article{olsson2016single, + title = {Single-cell analysis of mixed-lineage states leading to a binary cell fate choice}, + author = {Andre Olsson and Meenakshi Venkatasubramanian and Viren K. Chaudhri and Bruce J. Aronow and Nathan Salomonis and Harinder Singh and H. Leighton Grimes}, + year = {2016}, + month = {Aug.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {537}, + number = {7622}, + pages = {698--702}, + doi = {10.1038/nature19348}, + url = {https://doi.org/10.1038/nature19348} +} + + +@misc{openproblems, + title = {Open Problems}, + author = {{Open Problems for Single Cell Analysis Consortium}}, + year = {2022}, + url = {https://openproblems.bio} +} + + +@article{pearson1895regression, + doi = {10.1098/rspl.1895.0041}, + title = {VII. Note on regression and inheritance in the case of two parents}, + author = {Pearson, Karl}, + journal = {proceedings of the royal society of London}, + volume = {58}, + number = {347-352}, + pages = {240--242}, + year = {1895}, + publisher = {The Royal Society London} +} + + +@article{pearson1901pca, + title = {On lines and planes of closest fit to systems of points in space}, + author = {Karl Pearson}, + year = {1901}, + month = {Nov.}, + journal = {The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science}, + publisher = {Informa {UK} Limited}, + volume = {2}, + number = {11}, + pages = {559--572}, + doi = {10.1080/14786440109462720}, + url = {https://doi.org/10.1080/14786440109462720} +} + + +@article{pliner2019supervised, + title = {Supervised classification enables rapid annotation of cell atlases}, + author = {Hannah A. 
Pliner and Jay Shendure and Cole Trapnell}, + year = {2019}, + month = {Sept.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {10}, + pages = {983--986}, + doi = {10.1038/s41592-019-0535-3}, + url = {https://doi.org/10.1038/s41592-019-0535-3} +} + + +@article{polanski2020bbknn, + title = {{BBKNN}: fast batch alignment of single cell transcriptomes}, + author = {Krzysztof Pola{\'{n}}ski and Matthew D Young and Zhichao Miao and Kerstin B Meyer and Sarah A Teichmann and Jong-Eun Park}, + year = {2019}, + month = {Aug.}, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + doi = {10.1093/bioinformatics/btz625}, + url = {https://doi.org/10.1093/bioinformatics/btz625}, + editor = {Bonnie Berger} +} + + +@article{raredon2022computation, + title = {Computation and visualization of cell{\textendash}cell signaling topologies in single-cell systems data using Connectome}, + author = {Micha Sam Brickman Raredon and Junchen Yang and James Garritano and Meng Wang and Dan Kushnir and Jonas Christian Schupp and Taylor S. Adams and Allison M. Greaney and Katherine L. Leiby and Naftali Kaminski and Yuval Kluger and Andre Levchenko and Laura E. Niklason}, + year = {2022}, + month = {Mar.}, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {12}, + number = {1}, + doi = {10.1038/s41598-022-07959-x}, + url = {https://doi.org/10.1038/s41598-022-07959-x} +} + + +@article{rodriques2019slide, + title = {Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution}, + author = {Samuel G. Rodriques and Robert R. Stickels and Aleksandrina Goeva and Carly A. Martin and Evan Murray and Charles R. Vanderburg and Joshua Welch and Linlin M. Chen and Fei Chen and Evan Z. Macosko}, + year = {2019}, + month = {Mar.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {363}, + number = {6434}, + pages = {1463--1467}, + doi = {10.1126/science.aaw1219}, + url = {https://doi.org/10.1126/science.aaw1219} +} + + +@InProceedings{santos2009on, + author = {Santos, Jorge M. and Embrechts, Mark}, + editor = {Alippi, Cesare and Polycarpou, Marios and Panayiotou, Christos and Ellinas, Georgios}, + title = {On the Use of the Adjusted Rand Index as a Metric for Evaluating Supervised Classification}, + booktitle = {Artificial Neural Networks -- ICANN 2009}, + year = {2009}, + publisher = {Springer Berlin Heidelberg}, + address = {Berlin, Heidelberg}, + pages = {175--184}, + isbn = {978-3-642-04277-5}, + doi = {10.1007/978-3-642-04277-5_18}, + url = {https://doi.org/10.1007/978-3-642-04277-5_18} +} + + +@article{sarkar2021separating, + title = {Separating measurement and expression models clarifies confusion in single-cell {RNA} sequencing analysis}, + author = {Abhishek Sarkar and Matthew Stephens}, + year = {2021}, + month = {May}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {6}, + pages = {770--777}, + doi = {10.1038/s41588-021-00873-4}, + url = {https://doi.org/10.1038/s41588-021-00873-4} +} + + +@article{schober2018correlation, + title = {Correlation Coefficients}, + author = {Patrick Schober and Christa Boer and Lothar A. 
Schwarte}, + year = {2018}, + month = {May}, + journal = {Anesthesia {\&} Analgesia}, + publisher = {Ovid Technologies (Wolters Kluwer Health)}, + volume = {126}, + number = {5}, + pages = {1763--1768}, + doi = {10.1213/ane.0000000000002864}, + url = {https://doi.org/10.1213/ane.0000000000002864} +} + + +@string{sep = {Sept.}} + + +@inproceedings{stanley2020harmonic, + title = {Harmonic Alignment}, + author = {Jay S. Stanley and Scott Gigante and Guy Wolf and Smita Krishnaswamy}, + year = {2020}, + month = {Jan}, + booktitle = {Proceedings of the 2020 {SIAM} International Conference on Data Mining}, + publisher = {Society for Industrial and Applied Mathematics}, + pages = {316--324}, + doi = {10.1137/1.9781611976236.36}, + url = {https://doi.org/10.1137/1.9781611976236.36} +} + + +@article{stoeckius2017simultaneous, + title = {Simultaneous epitope and transcriptome measurement in single cells}, + author = {Marlon Stoeckius and Christoph Hafemeister and William Stephenson and Brian Houck-Loomis and Pratip K Chattopadhyay and Harold Swerdlow and Rahul Satija and Peter Smibert}, + year = {2017}, + month = {Jul.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {14}, + number = {9}, + pages = {865--868}, + doi = {10.1038/nmeth.4380}, + url = {https://doi.org/10.1038/nmeth.4380} +} + + +@article{stuart2019comprehensive, + title = {Comprehensive Integration of Single-Cell Data}, + author = {Stuart, T. and Butler, A. and Hoffman, P. and Hafemeister, C. and Papalexi, E. and Mauck, W.M. and Hao, Y. and Stoeckius, M. and Smibert, P. and Satija, R.}, + year = {2019}, + journal = {Cell}, + volume = {177}, + number = {7}, + pages = {1888--1902.e21}, + doi = {10.1016/j.cell.2019.05.031} +} + + +@article{szubert2019structurepreserving, + title = {Structure-preserving visualisation of high dimensional single-cell datasets}, + author = {Benjamin Szubert and Jennifer E. 
Cole and Claudia Monaco and Ignat Drozdov}, + year = {2019}, + month = {Jun.}, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {9}, + number = {1}, + doi = {10.1038/s41598-019-45301-0}, + url = {https://doi.org/10.1038/s41598-019-45301-0} +} + + +@article{tabula2018single, + title = {Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris}, + author = {{Tabula Muris Consortium}}, + year = {2018}, + month = {Oct.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {562}, + number = {7727}, + pages = {367--372}, + doi = {10.1038/s41586-018-0590-4}, + url = {https://doi.org/10.1038/s41586-018-0590-4} +} + + +@article{tabula2020single, + title = {A single-cell transcriptomic atlas characterizes ageing tissues in the mouse}, + author = {{Tabula Muris Consortium}}, + year = {2020}, + month = {Jul.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {583}, + number = {7817}, + pages = {590--595}, + doi = {10.1038/s41586-020-2496-1}, + url = {https://doi.org/10.1038/s41586-020-2496-1} +} + + +@article{tasic2016adult, + title = {Adult mouse cortical cell taxonomy revealed by single cell transcriptomics}, + author = {Bosiljka Tasic and Vilas Menon and Thuc Nghi Nguyen and Tae Kyung Kim and Tim Jarsky and Zizhen Yao and Boaz Levi and Lucas T Gray and Staci A Sorensen and Tim Dolbeare and Darren Bertagnolli and Jeff Goldy and Nadiya Shapovalova and Sheana Parry and Changkyu Lee and Kimberly Smith and Amy Bernard and Linda Madisen and Susan M Sunkin and Michael Hawrylycz and Christof Koch and Hongkui Zeng}, + year = {2016}, + month = {Jan}, + journal = {Nature Neuroscience}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {2}, + pages = {335--346}, + doi = {10.1038/nn.4216}, + url = {https://doi.org/10.1038/nn.4216} +} + + +@article{tian2019benchmarking, + title = {Benchmarking single cell {RNA}-sequencing analysis pipelines using mixture control experiments}, + author = {Luyi Tian and Xueyi Dong and Saskia Freytag and Kim-Anh L{\^{e}} Cao and Shian Su and Abolfazl JalalAbadi and Daniela Amann-Zalcenstein and Tom S. Weber and Azadeh Seidi and Jafar S. Jabbari and Shalin H. Naik and Matthew E. Ritchie}, + year = {2019}, + month = {May}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {6}, + pages = {479--487}, + doi = {10.1038/s41592-019-0425-8}, + url = {https://doi.org/10.1038/s41592-019-0425-8} +} + + +@article{tran2020benchmark, + doi = {10.1186/s13059-019-1850-9}, + url = {https://doi.org/10.1186/s13059-019-1850-9}, + year = {2020}, + month = {Jan}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {21}, + number = {1}, + author = {Hoa Thi Nhu Tran and Kok Siong Ang and Marion Chevrier and Xiaomeng Zhang and Nicole Yee Shin Lee and Michelle Goh and Jinmiao Chen}, + title = {A benchmark of batch-effect correction methods for single-cell {RNA} sequencing data}, + journal = {Genome Biology} +} + + +@article{van2018recovering, + title = {Recovering Gene Interactions from Single-Cell Data Using Data Diffusion}, + author = {David van Dijk and Roshan Sharma and Juozas Nainys and Kristina Yim and Pooja Kathail and Ambrose J. Carr and Cassandra Burdziak and Kevin R. Moon and Christine L. 
Chaffer and Diwakar Pattabiraman and Brian Bierie and Linas Mazutis and Guy Wolf and Smita Krishnaswamy and Dana Pe'er}, + year = {2018}, + month = {Jul.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {174}, + number = {3}, + pages = {716--729.e27}, + doi = {10.1016/j.cell.2018.05.061}, + url = {https://doi.org/10.1016/j.cell.2018.05.061} +} + + +@article{vandermaaten2008visualizing, + title = {Visualizing Data using t-SNE}, + author = {{van der} Maaten, Laurens and Hinton, Geoffrey}, + year = {2008}, + journal = {Journal of Machine Learning Research}, + volume = {9}, + number = {86}, + pages = {2579--2605}, + url = {http://jmlr.org/papers/v9/vandermaaten08a.html} +} + + +@inproceedings{venna2001neighborhood, + title = {Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2001}, + booktitle = {Artificial Neural Networks {\textemdash} {ICANN} 2001}, + publisher = {Springer Berlin Heidelberg}, + pages = {485--491}, + doi = {{10.1007/3-540-44668-0\_68}}, + url = {{https://doi.org/10.1007/3-540-44668-0\_68}} +} + + +@article{venna2006local, + title = {Local multidimensional scaling}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2006}, + month = {Jul.}, + journal = {Neural Networks}, + publisher = {Elsevier {BV}}, + volume = {19}, + number = {6-7}, + pages = {889--899}, + doi = {10.1016/j.neunet.2006.05.014}, + url = {https://doi.org/10.1016/j.neunet.2006.05.014} +} + + +@article{virshup2021anndataannotateddata, + doi = {10.1101/2021.12.16.473007}, + url = {https://doi.org/10.1101/2021.12.16.473007}, + year = {2021}, + month = {Dec.}, + publisher = {Cold Spring Harbor Laboratory}, + author = {Isaac Virshup and Sergei Rybakov and Fabian J. Theis and Philipp Angerer and F. Alexander Wolf}, + title = {anndata: Annotated data} +} + + +@article{wagner2018knearest, + title = {K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data}, + author = {Wagner, Florian and Yan, Yun and Yanai, Itai}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/217737}, + url = {https://www.biorxiv.org/content/early/2018/04/09/217737}, + elocation-id = {217737}, + eprint = {https://www.biorxiv.org/content/early/2018/04/09/217737.full.pdf} +} + + +@article{wagner2018single, + title = {Single-cell mapping of gene expression landscapes and lineage in the zebrafish embryo}, + author = {Daniel E. Wagner and Caleb Weinreb and Zach M. Collins and James A. Briggs and Sean G. Megason and Allon M. 
Klein}, + year = {2018}, + month = {Jun.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {360}, + number = {6392}, + pages = {981--987}, + doi = {10.1126/science.aar4362}, + url = {https://doi.org/10.1126/science.aar4362} +} + + +@article{wang2013target, + title = {Target analysis by integration of transcriptome and {ChIP}-seq data with {BETA}}, + author = {Su Wang and Hanfei Sun and Jian Ma and Chongzhi Zang and Chenfei Wang and Juan Wang and Qianzi Tang and Clifford A Meyer and Yong Zhang and X Shirley Liu}, + year = {2013}, + month = {Nov.}, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {8}, + number = {12}, + pages = {2502--2515}, + doi = {10.1038/nprot.2013.150}, + url = {https://doi.org/10.1038/nprot.2013.150} +} + + +@article{wang2017visualization, + title = {Visualization and analysis of single-cell {RNA}-seq data by kernel-based similarity learning}, + volume = {14}, + copyright = {2017 Springer Nature America, Inc.}, + issn = {1548-7105}, + url = {https://www.nature.com/articles/nmeth.4207}, + doi = {10.1038/nmeth.4207}, + abstract = {The SIMLR software identifies similarities between cells across a range of single-cell RNA-seq data, enabling effective dimension reduction, clustering and visualization.}, + language = {en}, + number = {4}, + journal = {Nature Methods}, + author = {Wang, Bo and Zhu, Junjie and Pierson, Emma and Ramazzotti, Daniele and Batzoglou, Serafim}, + month = apr, + year = {2017}, + publisher = {Nature Publishing Group}, + keywords = {Gene expression, Genome informatics, Machine learning, Statistical methods}, + pages = {414--416}, +} + + +@article{welch2019single, + title = {Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity}, + author = {Joshua D. Welch and Velina Kozareva and Ashley Ferreira and Charles Vanderburg and Carly Martin and Evan Z. Macosko}, + year = {2019}, + month = {Jun.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {177}, + number = {7}, + pages = {1873--1887.e17}, + doi = {10.1016/j.cell.2019.05.006}, + url = {https://doi.org/10.1016/j.cell.2019.05.006} +} + + +@article{wilkinson1973symbolic, + doi = {10.2307/2346786}, + url = {https://doi.org/10.2307/2346786}, + year = {1973}, + publisher = {{JSTOR}}, + volume = {22}, + number = {3}, + pages = {392}, + author = {G. N. Wilkinson and C. E. Rogers}, + title = {Symbolic Description of Factorial Models for Analysis of Variance}, + journal = {Applied Statistics} +} + + +@article{wu2021single, + title = {A single-cell and spatially resolved atlas of human breast cancers}, + author = {Sunny Z. Wu and Ghamdan Al-Eryani and Daniel Lee Roden and Simon Junankar and Kate Harvey and Alma Andersson and Aatish Thennavan and Chenfei Wang and James R. Torpy and Nenad Bartonicek and Taopeng Wang and Ludvig Larsson and Dominik Kaczorowski and Neil I. Weisenfeld and Cedric R. Uytingco and Jennifer G. Chew and Zachary W. Bent and Chia-Ling Chan and Vikkitharan Gnanasambandapillai and Charles-Antoine Dutertre and Laurence Gluch and Mun N. Hui and Jane Beith and Andrew Parker and Elizabeth Robbins and Davendra Segara and Caroline Cooper and Cindy Mak and Belinda Chan and Sanjay Warrier and Florent Ginhoux and Ewan Millar and Joseph E. Powell and Stephen R. Williams and X. Shirley Liu and Sandra O'Toole and Elgene Lim and Joakim Lundeberg and Charles M. 
Perou and Alexander Swarbrick}, + year = {2021}, + month = {Sept.}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {9}, + pages = {1334--1347}, + doi = {10.1038/s41588-021-00911-1}, + url = {https://doi.org/10.1038/s41588-021-00911-1} +} + + +@article{xiong2020neuralee, + title = {{NeuralEE}: A {GPU}-Accelerated Elastic Embedding Dimensionality Reduction Method for Visualizing Large-Scale {scRNA}-Seq Data}, + author = {Jiankang Xiong and Fuzhou Gong and Lin Wan and Liang Ma}, + year = {2020}, + month = {Oct.}, + journal = {Frontiers in Genetics}, + publisher = {Frontiers Media {SA}}, + volume = {11}, + doi = {10.3389/fgene.2020.00786}, + url = {https://doi.org/10.3389/fgene.2020.00786} +} + + +@article{xiong2021online, + title = {Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space}, + author = {Lei Xiong and Kang Tian and Yuzhe Li and Weixi Ning and Xin Gao and Qiangfeng Cliff Zhang}, + year = {2022}, + month = {Oct.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-33758-z}, + url = {https://doi.org/10.1038/s41467-022-33758-z} +} + + +@article{xu2021probabilistic, + title = {Probabilistic harmonization and annotation of single-cell transcriptomics data with deep generative models}, + author = {Chenling Xu and Romain Lopez and Edouard Mehlman and Jeffrey Regier and Michael I Jordan and Nir Yosef}, + year = {2021}, + month = {Jan}, + journal = {Molecular Systems Biology}, + publisher = {{Embo}}, + volume = {17}, + number = {1}, + doi = {10.15252/msb.20209620}, + url = {https://doi.org/10.15252/msb.20209620} +} + + +@article{zappia2018exploring, + doi = {10.1371/journal.pcbi.1006245}, + url = {https://doi.org/10.1371/journal.pcbi.1006245}, + year = {2018}, + month = {Jun.}, + publisher = {Public Library of Science ({PLoS})}, + volume = {14}, + number = {6}, + pages = {e1006245}, + author = {Luke Zappia and Belinda Phipson and Alicia Oshlack}, + editor = {Dina Schneidman}, + title = {Exploring the single-cell {RNA}-seq analysis landscape with the {scRNA}-tools database}, + journal = {{PLOS} Computational Biology} +} + + +@article{zhang2021pydrmetrics, + title = {{pyDRMetrics} - A Python toolkit for dimensionality reduction quality assessment}, + author = {Yinsheng Zhang and Qian Shang and Guoming Zhang}, + year = {2021}, + month = {Feb.}, + journal = {Heliyon}, + publisher = {Elsevier {BV}}, + volume = {7}, + number = {2}, + pages = {e06199}, + doi = {10.1016/j.heliyon.2021.e06199}, + url = {https://doi.org/10.1016/j.heliyon.2021.e06199} +} + +@article {hrovatin2023delineating, + author = {Karin Hrovatin and Aim{\'e}e Bastidas-Ponce and Mostafa Bakhti and Luke Zappia and Maren B{\"u}ttner and Ciro Sallino and Michael Sterr and Anika B{\"o}ttcher and Adriana Migliorini and Heiko Lickert and Fabian J. 
Theis}, + title = {Delineating mouse β-cell identity during lifetime and in diabetes with a single cell atlas}, + elocation-id = {2022.12.22.521557}, + year = {2023}, + doi = {10.1101/2022.12.22.521557}, + publisher = {Cold Spring Harbor Laboratory}, + URL = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557}, + eprint = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557.full.pdf}, + journal = {bioRxiv} +} + +@article{sikkema2023integrated, + title = {An integrated cell atlas of the lung in health and disease}, + volume = {29}, + ISSN = {1546-170X}, + url = {http://dx.doi.org/10.1038/s41591-023-02327-2}, + DOI = {10.1038/s41591-023-02327-2}, + number = {6}, + journal = {Nature Medicine}, + publisher = {Springer Science and Business Media LLC}, + author = {Sikkema, Lisa and Ramírez-Suástegui, Ciro and Strobl, Daniel C. and Gillett, Tessa E. and Zappia, Luke and Madissoon, Elo and Markov, Nikolay S. and Zaragosi, Laure-Emmanuelle and Ji, Yuge and Ansari, Meshal and Arguel, Marie-Jeanne and Apperloo, Leonie and Banchero, Martin and Bécavin, Christophe and Berg, Marijn and Chichelnitskiy, Evgeny and Chung, Mei-i and Collin, Antoine and Gay, Aurore C. A. and Gote-Schniering, Janine and Hooshiar Kashani, Baharak and Inecik, Kemal and Jain, Manu and Kapellos, Theodore S. and Kole, Tessa M. and Leroy, Sylvie and Mayr, Christoph H. and Oliver, Amanda J. and von Papen, Michael and Peter, Lance and Taylor, Chase J. and Walzthoeni, Thomas and Xu, Chuan and Bui, Linh T. and De Donno, Carlo and Dony, Leander and Faiz, Alen and Guo, Minzhe and Gutierrez, Austin J. and Heumos, Lukas and Huang, Ni and Ibarra, Ignacio L. and Jackson, Nathan D. and Kadur Lakshminarasimha Murthy, Preetish and Lotfollahi, Mohammad and Tabib, Tracy and Talavera-López, Carlos and Travaglini, Kyle J. and Wilbrey-Clark, Anna and Worlock, Kaylee B. and Yoshida, Masahiro and Chen, Yuexin and Hagood, James S. and Agami, Ahmed and Horvath, Peter and Lundeberg, Joakim and Marquette, Charles-Hugo and Pryhuber, Gloria and Samakovlis, Chistos and Sun, Xin and Ware, Lorraine B. and Zhang, Kun and van den Berge, Maarten and Bossé, Yohan and Desai, Tushar J. and Eickelberg, Oliver and Kaminski, Naftali and Krasnow, Mark A. and Lafyatis, Robert and Nikolic, Marko Z. and Powell, Joseph E. and Rajagopal, Jayaraj and Rojas, Mauricio and Rozenblatt-Rosen, Orit and Seibold, Max A. and Sheppard, Dean and Shepherd, Douglas P. and Sin, Don D. and Timens, Wim and Tsankov, Alexander M. and Whitsett, Jeffrey and Xu, Yan and Banovich, Nicholas E. and Barbry, Pascal and Duong, Thu Elizabeth and Falk, Christine S. and Meyer, Kerstin B. and Kropski, Jonathan A. and Pe’er, Dana and Schiller, Herbert B. and Tata, Purushothama Rao and Schultze, Joachim L. and Teichmann, Sara A. and Misharin, Alexander V. and Nawijn, Martijn C. and Luecken, Malte D. and Theis, Fabian J.}, + year = {2023}, + month = jun, + pages = {1563–1577} +} + +@article{consortium2022tabula, + title = {The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans}, + volume = {376}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl4896}, + DOI = {10.1126/science.abl4896}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Jones, Robert C. and Karkanias, Jim and Krasnow, Mark A. and Pisco, Angela Oliveira and Quake, Stephen R. 
and Salzman, Julia and Yosef, Nir and Bulthaup, Bryan and Brown, Phillip and Harper, William and Hemenez, Marisa and Ponnusamy, Ravikumar and Salehi, Ahmad and Sanagavarapu, Bhavani A. and Spallino, Eileen and Aaron, Ksenia A. and Concepcion, Waldo and Gardner, James M. and Kelly, Burnett and Neidlinger, Nikole and Wang, Zifa and Crasta, Sheela and Kolluru, Saroja and Morri, Maurizio and Pisco, Angela Oliveira and Tan, Serena Y. and Travaglini, Kyle J. and Xu, Chenling and Alcántara-Hernández, Marcela and Almanzar, Nicole and Antony, Jane and Beyersdorf, Benjamin and Burhan, Deviana and Calcuttawala, Kruti and Carter, Matthew M. and Chan, Charles K. F. and Chang, Charles A. and Chang, Stephen and Colville, Alex and Crasta, Sheela and Culver, Rebecca N. and Cvijović, Ivana and D’Amato, Gaetano and Ezran, Camille and Galdos, Francisco X. and Gillich, Astrid and Goodyer, William R. and Hang, Yan and Hayashi, Alyssa and Houshdaran, Sahar and Huang, Xianxi and Irwin, Juan C. and Jang, SoRi and Juanico, Julia Vallve and Kershner, Aaron M. and Kim, Soochi and Kiss, Bernhard and Kolluru, Saroja and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Li, Baoxiang and Loeb, Gabriel B. and Lu, Wan-Jin and Mantri, Sruthi and Markovic, Maxim and McAlpine, Patrick L. and de Morree, Antoine and Morri, Maurizio and Mrouj, Karim and Mukherjee, Shravani and Muser, Tyler and Neuh\"{o}fer, Patrick and Nguyen, Thi D. and Perez, Kimberly and Phansalkar, Ragini and Pisco, Angela Oliveira and Puluca, Nazan and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Schaum, Nicholas and Scott, Bronwyn and Seddighzadeh, Bobak and Segal, Joe and Sen, Sushmita and Sikandar, Shaheen and Spencer, Sean P. and Steffes, Lea C. and Subramaniam, Varun R. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Van Treuren, Will and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Vo, Kim Chi and Vorperian, Sevahn K. and Wang, Wanxin and Weinstein, Hannah N. W. and Winkler, Juliane and Wu, Timothy T. H. and Xie, Jamie and Yung, Andrea R. and Zhang, Yue and Detweiler, Angela M. and Mekonen, Honey and Neff, Norma F. and Sit, Rene V. and Tan, Michelle and Yan, Jia and Bean, Gregory R. and Charu, Vivek and Forgó, Erna and Martin, Brock A. and Ozawa, Michael G. and Silva, Oscar and Tan, Serena Y. and Toland, Angus and Vemuri, Venkata N. P. and Afik, Shaked and Awayan, Kyle and Botvinnik, Olga Borisovna and Byrne, Ashley and Chen, Michelle and Dehghannasiri, Roozbeh and Detweiler, Angela M. and Gayoso, Adam and Granados, Alejandro A. and Li, Qiqing and Mahmoudabadi, Gita and McGeever, Aaron and de Morree, Antoine and Olivieri, Julia Eve and Park, Madeline and Pisco, Angela Oliveira and Ravikumar, Neha and Salzman, Julia and Stanley, Geoff and Swift, Michael and Tan, Michelle and Tan, Weilun and Tarashansky, Alexander J. and Vanheusden, Rohan and Vorperian, Sevahn K. and Wang, Peter and Wang, Sheng and Xing, Galen and Xu, Chenling and Yosef, Nir and Alcántara-Hernández, Marcela and Antony, Jane and Chan, Charles K. F. and Chang, Charles A. and Colville, Alex and Crasta, Sheela and Culver, Rebecca and Dethlefsen, Les and Ezran, Camille and Gillich, Astrid and Hang, Yan and Ho, Po-Yi and Irwin, Juan C. and Jang, SoRi and Kershner, Aaron M. and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Liu, Shixuan and Loeb, Gabriel B. and Lu, Wan-Jin and Maltzman, Jonathan S. and Metzger, Ross J. 
and de Morree, Antoine and Neuh\"{o}fer, Patrick and Perez, Kimberly and Phansalkar, Ragini and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Sasagawa, Koki and Scott, Bronwyn and Sinha, Rahul and Song, Hanbing and Spencer, Sean P. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Wang, Bruce and Wang, Wanxin and Winkler, Juliane and Xie, Jamie and Yung, Andrea R. and Artandi, Steven E. and Beachy, Philip A. and Clarke, Michael F. and Giudice, Linda C. and Huang, Franklin W. and Huang, Kerwyn Casey and Idoyaga, Juliana and Kim, Seung K. and Krasnow, Mark and Kuo, Christin S. and Nguyen, Patricia and Quake, Stephen R. and Rando, Thomas A. and Red-Horse, Kristy and Reiter, Jeremy and Relman, David A. and Sonnenburg, Justin L. and Wang, Bruce and Wu, Albert and Wu, Sean M. and Wyss-Coray, Tony}, + year = {2022}, + month = may +} + +@article{dominguez2022crosstissue, + title = {Cross-tissue immune cell analysis reveals tissue-specific features in humans}, + volume = {376}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl5197}, + DOI = {10.1126/science.abl5197}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Domínguez Conde, C. and Xu, C. and Jarvis, L. B. and Rainbow, D. B. and Wells, S. B. and Gomes, T. and Howlett, S. K. and Suchanek, O. and Polanski, K. and King, H. W. and Mamanova, L. and Huang, N. and Szabo, P. A. and Richardson, L. and Bolt, L. and Fasouli, E. S. and Mahbubani, K. T. and Prete, M. and Tuck, L. and Richoz, N. and Tuong, Z. K. and Campos, L. and Mousa, H. S. and Needham, E. J. and Pritchard, S. and Li, T. and Elmentaite, R. and Park, J. and Rahmani, E. and Chen, D. and Menon, D. K. and Bayraktar, O. A. and James, L. K. and Meyer, K. B. and Yosef, N. and Clatworthy, M. R. and Sims, P. A. and Farber, D. L. and Saeb-Parsy, K. and Jones, J. L. and Teichmann, S. A.}, + year = {2022}, + month = may +} + +@article{eraslan2022singlenucleus, + title = {Single-nucleus cross-tissue molecular reference maps toward understanding disease gene function}, + volume = {376}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl4290}, + DOI = {10.1126/science.abl4290}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Eraslan, G\"{o}kcen and Drokhlyansky, Eugene and Anand, Shankara and Fiskin, Evgenij and Subramanian, Ayshwarya and Slyper, Michal and Wang, Jiali and Van Wittenberghe, Nicholas and Rouhana, John M. and Waldman, Julia and Ashenberg, Orr and Lek, Monkol and Dionne, Danielle and Win, Thet Su and Cuoco, Michael S. and Kuksenko, Olena and Tsankov, Alexander M. and Branton, Philip A. and Marshall, Jamie L. and Greka, Anna and Getz, Gad and Segrè, Ayellet V. and Aguet, Fran\c{c}ois and Rozenblatt-Rosen, Orit and Ardlie, Kristin G. 
and Regev, Aviv}, + year = {2022}, + month = may +} + +@article{li2023integrated, + title = {Integrated multi-omics single cell atlas of the human retina}, + url = {http://dx.doi.org/10.1101/2023.11.07.566105}, + DOI = {10.1101/2023.11.07.566105}, + publisher = {Cold Spring Harbor Laboratory}, + author = {Li, Jin and Wang, Jun and Ibarra, Ignacio L and Cheng, Xuesen and Luecken, Malte D and Lu, Jiaxiong and Monavarfeshani, Aboozar and Yan, Wenjun and Zheng, Yiqiao and Zuo, Zhen and Zayas Colborn, Samantha Lynn and Cortez, Berenice Sarahi and Owen, Leah A and Tran, Nicholas M and Shekhar, Karthik and Sanes, Joshua R and Stout, J Timothy and Chen, Shiming and Li, Yumei and DeAngelis, Margaret M and Theis, Fabian J and Chen, Rui}, + year = {2023}, + month = nov +} + +@article{wilson2022multimodal, + title = {Multimodal single cell sequencing implicates chromatin accessibility and genetic background in diabetic kidney disease progression}, + volume = {13}, + ISSN = {2041-1723}, + url = {http://dx.doi.org/10.1038/s41467-022-32972-z}, + DOI = {10.1038/s41467-022-32972-z}, + number = {1}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media LLC}, + author = {Wilson, Parker C. and Muto, Yoshiharu and Wu, Haojia and Karihaloo, Anil and Waikar, Sushrut S. and Humphreys, Benjamin D.}, + year = {2022}, + month = sep +} + +@article{steuernagel2022hypomap, + title = {HypoMap—a unified single-cell gene expression atlas of the murine hypothalamus}, + volume = {4}, + ISSN = {2522-5812}, + url = {http://dx.doi.org/10.1038/s42255-022-00657-y}, + DOI = {10.1038/s42255-022-00657-y}, + number = {10}, + journal = {Nature Metabolism}, + publisher = {Springer Science and Business Media LLC}, + author = {Steuernagel, Lukas and Lam, Brian Y. H. and Klemm, Paul and Dowsett, Georgina K. C. and Bauder, Corinna A. and Tadross, John A. and Hitschfeld, Tamara Sotelo and del Rio Martin, Almudena and Chen, Weiyi and de Solis, Alain J. and Fenselau, Henning and Davidsen, Peter and Cimino, Irene and Kohnke, Sara N. and Rimmington, Debra and Coll, Anthony P. and Beyer, Andreas and Yeo, Giles S. H. and Br\"{u}ning, Jens C.}, + year = {2022}, + month = oct, + pages = {1402–1419} +} + +@article{tian2023singlecell, + title = {Single-cell DNA methylation and 3D genome architecture in the human brain}, + volume = {382}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.adf5357}, + DOI = {10.1126/science.adf5357}, + number = {6667}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Tian, Wei and Zhou, Jingtian and Bartlett, Anna and Zeng, Qiurui and Liu, Hanqing and Castanon, Rosa G. and Kenworthy, Mia and Altshul, Jordan and Valadon, Cynthia and Aldridge, Andrew and Nery, Joseph R. and Chen, Huaming and Xu, Jiaying and Johnson, Nicholas D. and Lucero, Jacinta and Osteen, Julia K. and Emerson, Nora and Rink, Jon and Lee, Jasper and Li, Yang E. and Siletti, Kimberly and Liem, Michelle and Claffey, Naomi and O’Connor, Carolyn and Yanny, Anna Marie and Nyhus, Julie and Dee, Nick and Casper, Tamara and Shapovalova, Nadiya and Hirschstein, Daniel and Ding, Song-Lin and Hodge, Rebecca and Levi, Boaz P. and Keene, C. Dirk and Linnarsson, Sten and Lein, Ed and Ren, Bing and Behrens, M. 
Margarita and Ecker, Joseph R.}, + year = {2023}, + month = oct +} + + +@article{sonrel2023metaanalysis, + title = {Meta-analysis of (single-cell method) benchmarks reveals the need for extensibility and interoperability}, + volume = {24}, + ISSN = {1474-760X}, + url = {http://dx.doi.org/10.1186/s13059-023-02962-5}, + DOI = {10.1186/s13059-023-02962-5}, + number = {1}, + journal = {Genome Biology}, + publisher = {Springer Science and Business Media LLC}, + author = {Sonrel, Anthony and Luetge, Almut and Soneson, Charlotte and Mallona, Izaskun and Germain, Pierre-Luc and Knyazev, Sergey and Gilis, Jeroen and Gerber, Reto and Seurinck, Ruth and Paul, Dominique and Sonder, Emanuel and Crowell, Helena L. and Fanaswala, Imran and Al-Ajami, Ahmad and Heidari, Elyas and Schmeing, Stephan and Milosavljevic, Stefan and Saeys, Yvan and Mangul, Serghei and Robinson, Mark D.}, + year = {2023}, + month = may +} + + +@article{saelens2019comparison, + title = {A comparison of single-cell trajectory inference methods}, + volume = {37}, + ISSN = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-019-0071-9}, + DOI = {10.1038/s41587-019-0071-9}, + number = {5}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Saelens, Wouter and Cannoodt, Robrecht and Todorov, Helena and Saeys, Yvan}, + year = {2019}, + month = apr, + pages = {547–554} +} + + +@article{huang2018savergene, + title = {SAVER: gene expression recovery for single-cell RNA sequencing}, + volume = {15}, + ISSN = {1548-7105}, + url = {http://dx.doi.org/10.1038/s41592-018-0033-z}, + DOI = {10.1038/s41592-018-0033-z}, + number = {7}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media LLC}, + author = {Huang, Mo and Wang, Jingshu and Torre, Eduardo and Dueck, Hannah and Shaffer, Sydney and Bonasio, Roberto and Murray, John I. 
and Raj, Arjun and Li, Mingyao and Zhang, Nancy R.}, + year = {2018}, + month = jun, + pages = {539–542} +} + + +@article{chari2023speciousart, + title = {The specious art of single-cell genomics}, + volume = {19}, + ISSN = {1553-7358}, + url = {http://dx.doi.org/10.1371/journal.pcbi.1011288}, + DOI = {10.1371/journal.pcbi.1011288}, + number = {8}, + journal = {PLOS Computational Biology}, + publisher = {Public Library of Science (PLoS)}, + author = {Chari, Tara and Pachter, Lior}, + editor = {Papin, Jason A.}, + year = {2023}, + month = aug, + pages = {e1011288} +} diff --git a/common/nextflow_helpers/helper.nf b/common/nextflow_helpers/helper.nf new file mode 100644 index 0000000..e05fc9d --- /dev/null +++ b/common/nextflow_helpers/helper.nf @@ -0,0 +1,31 @@ +Map findArgumentSchema(Map config, String argument_id) { + def argument_groups = + (config.argument_groups ?: []) + + [ + arguments: config.arguments ?: [] + ] + + def schema_value = argument_groups.findResult{ gr -> + gr.arguments.find { arg -> + arg.name == ("--" + argument_id) + } + } + return schema_value +} + +Boolean checkItemAllowed(String item, List include, List exclude, String includeArgName, String excludeArgName) { + + // Throw an error if both include and exclude lists are provided + if (include != null && exclude != null) { + throw new Exception("Cannot define both ${includeArgName} and ${excludeArgName}") + } + + if (include) { + return include.contains(item) + } + if (exclude) { + return !exclude.contains(item) + } + + return true +} diff --git a/common/nextflow_helpers/labels_ci.config b/common/nextflow_helpers/labels_ci.config new file mode 100644 index 0000000..5161976 --- /dev/null +++ b/common/nextflow_helpers/labels_ci.config @@ -0,0 +1,11 @@ +process { + withLabel: lowmem { memory = 5.Gb } + withLabel: lowcpu { cpus = 2 } + withLabel: midmem { memory = 5.Gb } + withLabel: midcpu { cpus = 2 } + withLabel: highmem { memory = 5.Gb } + withLabel: highcpu { cpus = 2 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } +} diff --git a/common/nextflow_helpers/labels_tw.config b/common/nextflow_helpers/labels_tw.config new file mode 100644 index 0000000..1f71793 --- /dev/null +++ b/common/nextflow_helpers/labels_tw.config @@ -0,0 +1,93 @@ +process { + executor = 'awsbatch' + + // Default disk space + disk = 50.GB + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.attempt < 3 && task.exitStatus in (137) ? 'retry' : 'ignore' } + maxRetries = 3 + maxMemory = null + + // Resource labels + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowmem { + memory = { get_memory( 20.GB * task.attempt ) } + disk = { 50.GB * task.attempt } + } + withLabel: midmem { + memory = { get_memory( 50.GB * task.attempt ) } + disk = { 100.GB * task.attempt } + } + withLabel: highmem { + memory = { get_memory( 100.GB * task.attempt ) } + disk = { 200.GB * task.attempt } + } + withLabel: veryhighmem { + memory = { get_memory( 200.GB * task.attempt ) } + disk = { 400.GB * task.attempt } + } + withLabel: lowsharedmem { + containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""} + } + withLabel: midsharedmem { + containerOptions = { workflow.containerEngine != 'singularity' ? 
"--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""} + } + withLabel: highsharedmem { + containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""} + } + withLabel: gpu { + cpus = 16 + accelerator = 1 + containerOptions = { workflow.containerEngine == "singularity" ? '--nv': + ( workflow.containerEngine == "docker" ? '--gpus all': null ) } + } + withLabel: midgpu { + cpus = 32 + accelerator = 4 + containerOptions = { workflow.containerEngine == "singularity" ? '--nv': + ( workflow.containerEngine == "docker" ? '--gpus all': null ) } + } + withLabel: highgpu { + cpus = 64 + accelerator = 8 + containerOptions = { workflow.containerEngine == "singularity" ? '--nv': + ( workflow.containerEngine == "docker" ? '--gpus all': null ) } + } + + // make sure publishstates gets enough disk space and memory + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } +} + +// set tracing file +trace { + enabled = true + overwrite = true + file = "${params.publish_dir}/trace.txt" +} diff --git a/common/schemas/api_component_spec.yaml b/common/schemas/api_component_spec.yaml new file mode 100644 index 0000000..212671c --- /dev/null +++ b/common/schemas/api_component_spec.yaml @@ -0,0 +1,83 @@ +$schema: "http://json-schema.org/draft-07/schema#" +title: Component specification +description: | + A component type specification file for defining the task API. +type: object +required: [info] +additionalProperties: false +properties: + namespace: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Namespace" + info: + type: object + description: Metadata of the component. + required: [type, type_info] + properties: + type: + "$ref": "schema_openproblems_definitions.yaml#/definitions/ComponentType" + subtype: + "$ref": "schema_openproblems_definitions.yaml#/definitions/ComponentSubtype" + type_info: + type: object + description: Metadata related to the component type. + required: [label, summary, description] + properties: + label: + $ref: "schema_openproblems_definitions.yaml#/definitions/Label" + summary: + $ref: "schema_openproblems_definitions.yaml#/definitions/Summary" + description: + $ref: "schema_openproblems_definitions.yaml#/definitions/Description" + arguments: + type: array + description: Component-specific parameters. + items: + anyOf: + - $ref: 'schema_openproblems_definitions.yaml#/definitions/ComponentAPIFile' + - $ref: 'schema_viash.yaml#/definitions/Argument' + argument_groups: + type: array + description: Component-specific parameter groups. + items: + description: "A grouping of the arguments, used to display the help message." + type: "object" + properties: + name: + description: "The name of the argument group." + type: "string" + description: + description: "A description of the argument group. 
This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + label: + description: "A clean version of the argument group's name. This is only used\ + \ for documentation." + type: "string" + summary: + description: "A one-sentence summary of the argument group. This is only used\ + \ for documentation." + type: "string" + arguments: + description: "A list of arguments for this component. For each argument, a\ + \ type and a name must be specified. Depending on the type of argument,\ + \ different properties can be set. See these reference pages per type for\ + \ more information: \n\n - string\n - file\n - integer\n - double\n - boolean\n\ + \ - boolean_true\n - boolean_false\n" + type: "array" + items: + anyOf: + - $ref: 'schema_openproblems_definitions.yaml#/definitions/ComponentAPIFile' + - $ref: 'schema_viash.yaml#/definitions/Argument' + required: + - "name" + additionalProperties: false + resources: + type: array + description: Resources required to run the component. + items: + "$ref": "schema_viash.yaml#/definitions/Resource" + test_resources: + type: array + description: One or more scripts and resources used to test the component. + items: + "$ref": "schema_viash.yaml#/definitions/Resource" diff --git a/common/schemas/api_file_format.yaml b/common/schemas/api_file_format.yaml new file mode 100644 index 0000000..26cce7d --- /dev/null +++ b/common/schemas/api_file_format.yaml @@ -0,0 +1,31 @@ +$schema: "http://json-schema.org/draft-07/schema#" +title: File format +description: A file format specification file for defining the task API. +type: object +additionalProperties: false +required: [type, label, summary] +properties: + type: + const: file + label: + $ref: "schema_openproblems_definitions.yaml#/definitions/Label" + summary: + $ref: "schema_openproblems_definitions.yaml#/definitions/Summary" + description: + $ref: "schema_openproblems_definitions.yaml#/definitions/Description" + example: + description: A file in the `resources_test` folder which is an example of this file format. + type: string + __merge__: + $ref: "schema_openproblems_definitions.yaml#/definitions/Merge" + info: + description: 'Structured information. Can be any shape: a string, vector, map or even nested map.' + type: object + properties: + format: + oneOf: + - $ref: "schema_openproblems_definitions.yaml#/definitions/H5ADFormat" + - $ref: "schema_openproblems_definitions.yaml#/definitions/CSVFormat" + - $ref: "schema_openproblems_definitions.yaml#/definitions/TSVFormat" + - $ref: "schema_openproblems_definitions.yaml#/definitions/ParquetFormat" + - $ref: "schema_openproblems_definitions.yaml#/definitions/SpatialDataZarrFormat" diff --git a/common/schemas/schema_openproblems_definitions.yaml b/common/schemas/schema_openproblems_definitions.yaml new file mode 100644 index 0000000..46e861a --- /dev/null +++ b/common/schemas/schema_openproblems_definitions.yaml @@ -0,0 +1,439 @@ +$schema: "http://json-schema.org/draft-07/schema#" +definitions: + NextflowRunner: + title: Nextflow Runner + description: Run a Viash component on a Nextflow backend engine. 
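+    # Illustrative sketch (assumed usage, not part of the schema itself): in a
+    # component's config.vsh.yaml, such a runner is typically declared as
+    #   runners:
+    #     - type: nextflow
+    #       directives:
+    #         label: [midtime, midmem, midcpu]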
+ properties: + type: + description: "Run a Viash component on a Nextflow backend engine.\n" + const: "nextflow" + directives: + $ref: 'schema_viash.yaml#/definitions/NextflowDirectives' + required: [ type ] + additionalProperties: false + ExecutableRunner: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner is\ + \ also used for the native engine.\n\nThis runner is also used for the docker\ + \ engine.\n" + type: "object" + properties: + docker_run_args: + oneOf: + - description: "Provide runtime arguments to Docker. See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + - type: "array" + items: + description: "Provide runtime arguments to Docker. See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + type: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner\ + \ is also used for the native engine.\n\nThis runner is also used for the\ + \ docker engine.\n" + const: "executable" + required: + - "type" + additionalProperties: false + DockerEngine: + title: Docker Engine + description: "Run a Viash component on a Docker backend engine.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a docker\ + \ container from scratch using the setup flag, or pull it from a docker repository.\n" + type: object + properties: + type: + const: docker + description: Run a Viash component on a Docker backend platform. + image: + type: string + description: The base container to start from. You can also add the tag here + if you wish. + run_args: + anyOf: + - type: string + description: Add docker run arguments. + - type: array + items: + type: string + description: Add docker run arguments. + setup: + type: array + items: + "$ref": "schema_viash.yaml#/definitions/Requirements" + test_setup: + type: array + items: + "$ref": "schema_viash.yaml#/definitions/Requirements" + required: [type, image] + additionalProperties: false + PreferredNormalization: + enum: [l1_sqrt, log_cpm, log_cp10k, log_scran_pooling, sqrt_cpm, sqrt_cp10k, counts] + description: | + Which normalization method a component prefers. + + Each value corresponds to a normalization component in the directory `src/datasets/normalization`. + ComponentSubtype: + type: string + description: | + A component subtype, in case the task has multiple subtypes of methods and metrics. + ComponentType: + type: string + description: | + A component subtype, in case the task has multiple subtypes of methods and metrics. + Name: + type: string + description: | + A unique identifier. Can only contain lowercase letters, numbers or underscores. + pattern: "^[a-z_][a-z0-9_]*$" + maxLength: 50 + Namespace: + type: string + description: | + The namespace a component is part of. + pattern: "^[a-z_][a-z0-9_/]*$" + Label: + type: string + description: | + A unique, human-readable, short label. Used for creating summary tables and visualisations. + maxLength: 50 + Image: + type: string + description: | + The name of the image file to use for the component on the website. + Summary: + type: string + description: | + A one sentence summary of purpose and methodology. Used for creating an overview tables. 
+ minLength: 15 + maxLength: 180 + Description: + type: string + description: | + A longer description (one or more paragraphs). Used for creating reference documentation and supplementary information. + minLength: 30 + BibtexReference: + type: string + description: | + A bibtex reference key to the paper where the component is described. + DocumentationURL: + type: string + format: uri + pattern: "^https://" + description: The url to the documentation of the used software library. + RepositoryURL: + type: string + format: uri + pattern: "^https://" + description: The url to the repository of the used software library. + MethodVariants: + type: object + description: Alternative parameter sets which should be evaluated in the benchmark. + properties: + preferred_normalization: + "$ref": "#/definitions/PreferredNormalization" + CompAPIMerge: + type: string + description: | + The API specifies which type of component this is. + It contains specifications for: + + - The input/output files + - Common parameters + - A unit test + Merge: + type: string + description: | + Another YAML to inherit values from. + ComponentAPIFile: + description: A `file` type argument has a string value that points to a file or folder path. + type: object + properties: + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f` or `foo`. The number of dashes determines how values can be passed: \n\n - `--foo` is a long option, which can be passed with `executable_name --foo=value` or `executable_name --foo value`\n - `-f` is a short option, which can be passed with `executable_name -f value`\n - `foo` is an argument, which can be passed with `executable_name value` \n" + type: string + __merge__: + type: string + description: The file format specification file. + direction: + description: Makes this argument an `input` or an `output`, as in does the file/folder needs to be read or written. `input` by default. + $ref: 'schema_viash.yaml#/definitions/Direction' + info: + description: 'Structured information. Can be any shape: a string, vector, map or even nested map.' + type: object + required: + description: Make the value for this argument required. If set to `true`, an error will be produced if no value was provided. `false` by default. + type: boolean + required: [name, __merge__, direction, required] + #additionalProperties: false + AnnDataObject: + properties: + X: + $ref: "#/definitions/DataTypeSpec" + layers: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + var: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + varm: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + varp: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + obs: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + obsm: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + obsp: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + uns: + type: array + items: + anyOf: + - $ref: "#/definitions/DataTypeSpec" + - $ref: "#/definitions/DataTypeObject" + H5ADFormat: + type: object + properties: + type: + const: h5ad + description: The file format. 
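+      # Illustrative sketch (hypothetical layer name, not normative): a file format
+      # spec validated against this definition could read
+      #   info:
+      #     format:
+      #       type: h5ad
+      #       layers:
+      #         - type: integer
+      #           name: counts
+      #           description: Raw counts.
+      #           required: true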
+ X: + type: object + layers: + type: array + obs: + type: array + obsm: + type: array + obsp: + type: array + var: + type: array + varm: + type: array + varp: + type: array + uns: + type: array + required: [type] + additionalProperties: false + allOf: + - $ref: "#/definitions/AnnDataObject" + CSVFormat: + type: object + required: [type] + additionalProperties: false + properties: + type: + const: csv + description: The file format. + columns: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + TSVFormat: + type: object + required: [type] + additionalProperties: false + properties: + type: + const: tsv + description: The file format. + columns: + type: array + items: + $ref: "#/definitions/DataTypeSpec" + ParquetFormat: + type: object + required: [type] + additionalProperties: false + properties: + type: + const: parquet + description: The file format. + columns: + type: array + items: + $ref: "#/definitions/MoreDataTypeSpec" + SpatialDataZarrFormat: + type: object + required: [type] + additionalProperties: false + properties: + type: + const: spatialdata_zarr + description: The file format. + images: + type: array + items: + $ref: "#/definitions/DataTypeObject" + labels: + type: array + items: + $ref: "#/definitions/DataTypeObject" + points: + type: array + items: + type: object + properties: + type: + const: dataframe + name: + type: string + description: + type: string + required: + type: boolean + columns: + type: array + items: + $ref: "#/definitions/MoreDataTypeSpec" + required: [type, name, description, required] + additionalProperties: false + shapes: + type: array + items: + type: object + properties: + type: + const: dataframe + name: + type: string + description: + type: string + required: + type: boolean + columns: + type: array + items: + $ref: "#/definitions/MoreDataTypeSpec" + required: [type, name, description, required] + additionalProperties: false + tables: + type: array + items: + type: object + properties: + name: + type: string + type: + const: anndata + description: + type: string + required: + type: boolean + X: + type: object + layers: + type: array + obs: + type: array + obsm: + type: array + obsp: + type: array + var: + type: array + varm: + type: array + varp: + type: array + uns: + type: array + required: [type, name, description, required] + additionalProperties: false + allOf: + - $ref: "#/definitions/AnnDataObject" + coordinate_systems: + type: array + items: + $ref: "#/definitions/DataTypeObject" + DataTypeSpec: + properties: + type: + enum: [integer, double, string, boolean] + name: + type: string + description: A unique identifier. + pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" + description: + type: string + required: + type: boolean + required: [type, name, description, required] + MoreDataTypeSpec: + properties: + type: + enum: [integer, long, float, double, string, categorical, boolean, object] + name: + type: string + description: A unique identifier. + pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" + description: + type: string + required: + type: boolean + required: [type, name, description, required] + DataTypeObject: + properties: + type: + const: object + name: + type: string + description: A unique identifier. + pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" + description: + type: string + required: + type: boolean + required: [type, name, description, required] + Author: + description: Author metadata. + type: object + additionalProperties: false + properties: + name: + description: Full name of the author, usually in the name of FirstName MiddleName LastName. 
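+        # Illustrative sketch (hypothetical author name and handles): an authors
+        # entry using these fields might read
+        #   authors:
+        #     - name: Jane Doe
+        #       roles: [author, maintainer]
+        #       info:
+        #         github: janedoe
+        #         orcid: "0000-0000-0000-0000"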
+ type: string + info: + description: Additional information on the author + type: object + additionalProperties: false + properties: + github: + type: string + orcid: + type: string + email: + type: string + twitter: + type: string + linkedin: + type: string + roles: + description: | + Role of the author. Possible values: + + * `"author"`: Authors who have made substantial contributions to the component. + * `"maintainer"`: The maintainer of the component. + * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). + type: array + items: + enum: [maintainer, author, contributor] diff --git a/common/schemas/schema_viash.yaml b/common/schemas/schema_viash.yaml new file mode 100644 index 0000000..2917273 --- /dev/null +++ b/common/schemas/schema_viash.yaml @@ -0,0 +1,3135 @@ +$schema: "http://json-schema.org/draft-07/schema#" +definitions: + Config: + description: "A Viash configuration is a YAML file which contains metadata to\ + \ describe the behaviour and build target(s) of a component. \nWe commonly\ + \ name this file `config.vsh.yaml` in our examples, but you can name it however\ + \ you choose. \n" + type: "object" + properties: + label: + description: "A clean version of the component's name. This is only used for\ + \ documentation." + type: "string" + license: + description: "The license of the package." + type: "string" + authors: + description: "A list of authors. An author must at least have a name, but\ + \ can also have a list of roles, an e-mail address, and a map of custom\ + \ properties.\n\nSuggested values for roles are:\n \n| Role | Abbrev. |\ + \ Description |\n|------|---------|-------------|\n| maintainer | mnt |\ + \ for the maintainer of the code. Ideally, exactly one maintainer is specified.\ + \ |\n| author | aut | for persons who have made substantial contributions\ + \ to the software. |\n| contributor | ctb| for persons who have made smaller\ + \ contributions (such as code patches).\n| datacontributor | dtc | for persons\ + \ or organisations that contributed data sets for the software\n| copyrightholder\ + \ | cph | for all copyright holders. This is a legal concept so should use\ + \ the legal name of an institution or corporate body.\n| funder | fnd |\ + \ for persons or organizations that furnished financial support for the\ + \ development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\ + \ is extremely comprehensive.\n" + type: "array" + items: + $ref: "#/definitions/Author" + status: + description: "Allows setting a component to active, deprecated or disabled." + $ref: "#/definitions/Status" + requirements: + description: "Computational requirements related to running the component.\ + \ \n`cpus` specifies the maximum number of (logical) cpus a component is\ + \ allowed to use., whereas\n`memory` specifies the maximum amount of memory\ + \ a component is allowed to allicate. Memory units must be\nin B, KB, MB,\ + \ GB, TB or PB for SI units (1000-base), or KiB, MiB, GiB, TiB or PiB for\ + \ binary IEC units (1024-base)." + $ref: "#/definitions/ComputationalRequirements" + repositories: + description: "(Pre-)defines repositories that can be used as repository in\ + \ dependencies.\nAllows reusing repository definitions in case it is used\ + \ in multiple dependencies." 
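+        # Illustrative sketch (hypothetical repository; field names assumed from the
+        # RepositoryWithName definition referenced below):
+        #   repositories:
+        #     - name: core
+        #       type: github
+        #       repo: some-org/some-repo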
+ type: "array" + items: + $ref: "#/definitions/RepositoryWithName" + dependencies: + description: "Allows listing Viash components required by this Viash component" + type: "array" + items: + $ref: "#/definitions/Dependency" + summary: + description: "A one-sentence summary of the component. This is only used for\ + \ documentation." + type: "string" + functionality: + description: "The functionality describes the behaviour of the script in terms\ + \ of arguments and resources.\nBy specifying a few restrictions (e.g. mandatory\ + \ arguments) and adding some descriptions, Viash will automatically generate\ + \ a stylish command-line interface for you.\n" + $ref: "#/definitions/Functionality" + runners: + description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\ + \ - NextflowRunner\n" + type: "array" + items: + $ref: "#/definitions/Runner" + name: + description: "Name of the component and the filename of the executable when\ + \ built with `viash build`." + type: "string" + build_info: + $ref: "#/definitions/BuildInfo" + argument_groups: + description: "A grouping of the arguments, used to display the help message.\n\ + \n - `name: foo`, the name of the argument group. \n - `description: Description\ + \ of foo`, a description of the argument group. Multiline descriptions are\ + \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ + \n" + type: "array" + items: + $ref: "#/definitions/ArgumentGroup" + description: + description: "A description of the component. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + usage: + description: "A description on how to use the component. This will be displayed\ + \ with `--help` under the 'Usage:' section." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + package_config: + description: "The package config content used during build." + $ref: "#/definitions/PackageConfig" + platforms: + description: "A list of platforms to generate target artifacts for.\n\n -\ + \ Native\n - Docker\n - Nextflow\n" + type: "array" + items: + $ref: "#/definitions/Platform" + version: + description: "Version of the component. This field will be used to version\ + \ the executable and the Docker container." + type: "string" + links: + description: "External links of the component." + $ref: "#/definitions/Links" + references: + description: "References to external resources related to the component." + $ref: "#/definitions/References" + engines: + description: "A list of engine environments to execute target artifacts in.\n\ + \n - NativeEngine\n - DockerEngine\n" + type: "array" + items: + $ref: "#/definitions/Engine" + resources: + description: "Resources are files that support the component. The first resource\ + \ should be a script that will be executed when the component is run. Additional\ + \ resources will be copied to the same directory.\n\nCommon properties:\n\ + \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ + \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ + \ The first resource cannot be of type `file`. When the type is not specified,\ + \ the default type is simply `file`.\n * dest: filename, the resulting name\ + \ of the resource. From within a script, the file can be accessed at `meta[\"\ + resources_dir\"] + \"/\" + dest`. 
If unspecified, `dest` will be set to\ + \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ + \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ + \ exclusive with `text`.\n * text: ...multiline text..., the content of\ + \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ + \ * is_executable: `true` / `false`, whether the resulting resource file\ + \ should be made executable.\n" + type: "array" + items: + $ref: "#/definitions/Resource" + keywords: + description: "The keywords of the components." + type: "array" + items: + type: "string" + test_resources: + description: "One or more scripts to be used to test the component behaviour\ + \ when `viash test` is invoked. Additional files of type `file` will be\ + \ made available only during testing. Each test script should expect no\ + \ command-line inputs, be platform-independent, and return an exit code\ + \ >0 when unexpected behaviour occurs during testing. See Unit Testing for\ + \ more info." + type: "array" + items: + $ref: "#/definitions/Resource" + namespace: + description: "Namespace this component is a part of. See the Namespaces guide\ + \ for more information on namespaces." + type: "string" + arguments: + description: "A list of arguments for this component. For each argument, a\ + \ type and a name must be specified. Depending on the type of argument,\ + \ different properties can be set. See these reference pages per type for\ + \ more information: \n\n - string\n - file\n - integer\n - double\n - boolean\n\ + \ - boolean_true\n - boolean_false\n" + type: "array" + items: + $ref: "#/definitions/Argument" + required: + - "name" + additionalProperties: false + PackageConfig: + description: "A Viash package configuration file. It's name should be `_viash.yaml`." + type: "object" + properties: + organization: + description: "The organization of the package." + type: "string" + name: + description: "The name of the package." + type: "string" + source: + description: "Which source directory to use for the `viash ns` commands." + type: "string" + description: + description: "A description of the package. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + config_mods: + oneOf: + - description: "Which config mods to apply." + type: "string" + - type: "array" + items: + description: "Which config mods to apply." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + license: + description: "The license of the package." + type: "string" + references: + description: "References to external resources related to the package." + $ref: "#/definitions/References" + authors: + description: "The authors of the package." + type: "array" + items: + $ref: "#/definitions/Author" + repositories: + description: "Common repository definitions for component dependencies." + type: "array" + items: + $ref: "#/definitions/RepositoryWithName" + keywords: + description: "The keywords of the package." + type: "array" + items: + type: "string" + target: + description: "Which target directory to use for `viash ns build`." + type: "string" + summary: + description: "A one-sentence summary of the package. This is only used for\ + \ documentation." + type: "string" + viash_version: + description: "Which version of Viash to use." + type: "string" + label: + description: "A clean version of the package name. 
This is only used for documentation." + type: "string" + version: + description: "The version of the package." + type: "string" + links: + description: "External links of the package." + $ref: "#/definitions/Links" + required: [] + additionalProperties: false + BuildInfo: + description: "Meta information fields filled in by Viash during build." + type: "object" + properties: + git_tag: + description: "Git tag." + type: "string" + git_remote: + description: "Git remote name." + type: "string" + viash_version: + description: "The Viash version that was used to build the component." + type: "string" + output: + description: "Folder path to the build artifacts." + type: "string" + git_commit: + description: "Git commit hash." + type: "string" + engine: + description: "The engine id used during build." + type: "string" + runner: + description: "The runner id used during build." + type: "string" + config: + description: "Path to the config used during build." + type: "string" + dependencies: + description: "List of dependencies used during build." + type: "array" + items: + type: "string" + executable: + description: "Output folder with main executable path." + type: "string" + required: + - "config" + additionalProperties: false + Functionality: + description: "The functionality-part of the config file describes the behaviour\ + \ of the script in terms of arguments and resources.\nBy specifying a few restrictions\ + \ (e.g. mandatory arguments) and adding some descriptions, Viash will automatically\ + \ generate a stylish command-line interface for you.\n" + type: "object" + properties: + organization: + description: "The organization of the package." + type: "string" + name: + description: "Name of the component and the filename of the executable when\ + \ built with `viash build`." + type: "string" + argument_groups: + description: "A grouping of the arguments, used to display the help message.\n\ + \n - `name: foo`, the name of the argument group. \n - `description: Description\ + \ of foo`, a description of the argument group. Multiline descriptions are\ + \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ + \n" + type: "array" + items: + $ref: "#/definitions/ArgumentGroup" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + license: + description: "The license of the package." + type: "string" + references: + description: "References to external resources related to the component." + $ref: "#/definitions/References" + authors: + description: "A list of authors. An author must at least have a name, but\ + \ can also have a list of roles, an e-mail address, and a map of custom\ + \ properties.\n\nSuggested values for roles are:\n \n| Role | Abbrev. |\ + \ Description |\n|------|---------|-------------|\n| maintainer | mnt |\ + \ for the maintainer of the code. Ideally, exactly one maintainer is specified.\ + \ |\n| author | aut | for persons who have made substantial contributions\ + \ to the software. |\n| contributor | ctb| for persons who have made smaller\ + \ contributions (such as code patches).\n| datacontributor | dtc | for persons\ + \ or organisations that contributed data sets for the software\n| copyrightholder\ + \ | cph | for all copyright holders. 
This is a legal concept so should use\ + \ the legal name of an institution or corporate body.\n| funder | fnd |\ + \ for persons or organizations that furnished financial support for the\ + \ development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\ + \ is extremely comprehensive.\n" + type: "array" + items: + $ref: "#/definitions/Author" + status: + description: "Allows setting a component to active, deprecated or disabled." + $ref: "#/definitions/Status" + requirements: + description: "Computational requirements related to running the component.\ + \ \n`cpus` specifies the maximum number of (logical) cpus a component is\ + \ allowed to use., whereas\n`memory` specifies the maximum amount of memory\ + \ a component is allowed to allicate. Memory units must be\nin B, KB, MB,\ + \ GB, TB or PB for SI units (1000-base), or KiB, MiB, GiB, TiB or PiB for\ + \ binary IEC units (1024-base)." + $ref: "#/definitions/ComputationalRequirements" + repositories: + description: "(Pre-)defines repositories that can be used as repository in\ + \ dependencies.\nAllows reusing repository definitions in case it is used\ + \ in multiple dependencies." + type: "array" + items: + $ref: "#/definitions/RepositoryWithName" + test_resources: + description: "One or more scripts to be used to test the component behaviour\ + \ when `viash test` is invoked. Additional files of type `file` will be\ + \ made available only during testing. Each test script should expect no\ + \ command-line inputs, be platform-independent, and return an exit code\ + \ >0 when unexpected behaviour occurs during testing. See Unit Testing for\ + \ more info." + type: "array" + items: + $ref: "#/definitions/Resource" + dependencies: + description: "Allows listing Viash components required by this Viash component" + type: "array" + items: + $ref: "#/definitions/Dependency" + description: + description: "A description of the component. This will be displayed with\ + \ `--help`." + type: "string" + usage: + description: "A description on how to use the component. This will be displayed\ + \ with `--help` under the 'Usage:' section." + type: "string" + version: + description: "Version of the component. This field will be used to version\ + \ the executable and the Docker container." + type: "string" + links: + description: "External links of the component." + $ref: "#/definitions/Links" + resources: + description: "Resources are files that support the component. The first resource\ + \ should be a script that will be executed when the functionality is run.\ + \ Additional resources will be copied to the same directory.\n\nCommon properties:\n\ + \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ + \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ + \ The first resource cannot be of type `file`. When the type is not specified,\ + \ the default type is simply `file`.\n * dest: filename, the resulting name\ + \ of the resource. From within a script, the file can be accessed at `meta[\"\ + resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ + \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ + \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ + \ exclusive with `text`.\n * text: ...multiline text..., the content of\ + \ the resulting file specified as a string. 
Mutually exclusive with `path`.\n\ + \ * is_executable: `true` / `false`, whether the resulting resource file\ + \ should be made executable.\n" + type: "array" + items: + $ref: "#/definitions/Resource" + keywords: + description: "The keywords of the components." + type: "array" + items: + type: "string" + namespace: + description: "Namespace this component is a part of. See the Namespaces guide\ + \ for more information on namespaces." + type: "string" + arguments: + description: "A list of arguments for this component. For each argument, a\ + \ type and a name must be specified. Depending on the type of argument,\ + \ different properties can be set. See these reference pages per type for\ + \ more information: \n\n - string\n - file\n - integer\n - double\n - boolean\n\ + \ - boolean_true\n - boolean_false\n" + type: "array" + items: + $ref: "#/definitions/Argument" + required: + - "name" + additionalProperties: false + Author: + description: "Author metadata." + type: "object" + properties: + name: + description: "Full name of the author, usually in the name of FirstName MiddleName\ + \ LastName." + type: "string" + email: + description: "E-mail of the author." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + roles: + oneOf: + - description: "Role of the author. Suggested items:\n\n* \"author\": Authors\ + \ who have made substantial contributions to the component.\n* \"maintainer\"\ + : The maintainer of the component.\n* \"contributor\": Authors who have\ + \ made smaller contributions (such as code patches etc.).\n" + type: "string" + - type: "array" + items: + description: "Role of the author. Suggested items:\n\n* \"author\": Authors\ + \ who have made substantial contributions to the component.\n* \"maintainer\"\ + : The maintainer of the component.\n* \"contributor\": Authors who have\ + \ made smaller contributions (such as code patches etc.).\n" + type: "string" + required: + - "name" + additionalProperties: false + ComputationalRequirements: + description: "Computational requirements related to running the component." + type: "object" + properties: + cpus: + description: "The maximum number of (logical) cpus a component is allowed\ + \ to use." + type: "integer" + commands: + description: "A list of commands which should be present on the system for\ + \ the script to function." + type: "array" + items: + type: "string" + memory: + description: "The maximum amount of memory a component is allowed to allocate.\ + \ Unit must be one of B, KB, MB, GB, TB or PB for SI units (1000-base),\ + \ or KiB, MiB, GiB, TiB or PiB for binary IEC units (1024-base)." + type: "string" + required: [] + additionalProperties: false + ArgumentGroup: + description: "A grouping of the arguments, used to display the help message." + type: "object" + properties: + name: + description: "The name of the argument group." + type: "string" + description: + description: "A description of the argument group. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + label: + description: "A clean version of the argument group's name. This is only used\ + \ for documentation." + type: "string" + summary: + description: "A one-sentence summary of the argument group. This is only used\ + \ for documentation." + type: "string" + arguments: + description: "A list of arguments for this component. For each argument, a\ + \ type and a name must be specified. 
Depending on the type of argument,\ + \ different properties can be set. See these reference pages per type for\ + \ more information: \n\n - string\n - file\n - integer\n - double\n - boolean\n\ + \ - boolean_true\n - boolean_false\n" + type: "array" + items: + $ref: "#/definitions/Argument" + required: + - "name" + additionalProperties: false + Links: + description: "Links to external resources related to the component." + type: "object" + properties: + repository: + description: "Source repository url." + type: "string" + documentation: + description: "Documentation website url." + type: "string" + docker_registry: + description: "Docker registry url." + type: "string" + homepage: + description: "Homepage website url." + type: "string" + issue_tracker: + description: "Issue tracker url." + type: "string" + required: [] + additionalProperties: false + References: + description: "A list of scholarly sources or publications relevant to the tools\ + \ or analysis defined in the component. This is important for attribution, scientific\ + \ reproducibility and transparency." + type: "object" + properties: + bibtex: + oneOf: + - description: "One or multiple BibTeX reference(s) of the component." + type: "string" + - type: "array" + items: + description: "One or multiple BibTeX reference(s) of the component." + type: "string" + doi: + oneOf: + - description: "One or multiple DOI reference(s) of the component." + type: "string" + - type: "array" + items: + description: "One or multiple DOI reference(s) of the component." + type: "string" + required: [] + additionalProperties: false + Runner: + oneOf: + - $ref: "#/definitions/ExecutableRunner" + - $ref: "#/definitions/NextflowRunner" + ExecutableRunner: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner is\ + \ also used for the native engine.\n\nThis runner is also used for the docker\ + \ engine.\n" + type: "object" + properties: + docker_setup_strategy: + description: "The Docker setup strategy to use when building a docker engine\ + \ enrivonment.\n\n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild`\ + \ / `build` / `b` | Always build the image from the dockerfile. This is\ + \ the default setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb`\ + \ | Always build the image from the dockerfile, with caching enabled.\n\ + | `ifneedbebuild` | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ + \ | Build the image with caching enabled if it does not exist locally, with\ + \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ + \ from [Docker Hub](https://hub.docker.com) or the specified docker registry.\n\ + | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ + \ a registry and build it if it doesn't exist.\n| `alwayspullelsecachedbuild`\ + \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ + \ it with caching if it doesn't exist.\n| `ifneedbepull` | If the image\ + \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ + \ If the image does not exist locally, pull the image. If the image does\ + \ exist, build it.\n| `ifneedbepullelsecachedbuild` | If the image does\ + \ not exist locally, pull the image. 
If the image does exist, build it with\ + \ caching enabled.\n| `push` | Push the container to [Docker Hub](https://hub.docker.com)\ + \ or the specified docker registry.\n| `pushifnotpresent` | Push the container\ + \ to [Docker Hub](https://hub.docker.com) or the specified docker registry\ + \ if the tag does not exist yet.\n| `donothing` / `meh` | Do not build or\ + \ pull anything.\n\n" + $ref: "#/definitions/DockerSetupStrategy" + workdir: + description: "The working directory when starting the engine. This doesn't\ + \ change the Dockerfile but gets added as a command-line argument at runtime." + type: "string" + docker_run_args: + oneOf: + - description: "Provide runtime arguments to Docker. See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + - type: "array" + items: + description: "Provide runtime arguments to Docker. See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + id: + description: "Name of the runner. As with all runners, you can give an runner\ + \ a different name. By specifying `id: foo`, you can target this executor\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + port: + oneOf: + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "integer" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "string" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "array" + items: + type: "integer" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "array" + items: + type: "string" + type: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner\ + \ is also used for the native engine.\n\nThis runner is also used for the\ + \ docker engine.\n" + const: "executable" + required: + - "type" + additionalProperties: false + NextflowRunner: + description: "Run a Viash component on a Nextflow backend engine.\n" + type: "object" + properties: + auto: + description: "Automated processing flags which can be toggled on or off:\n\ + \n| Flag | Description | Default |\n|---|---------|----|\n| `simplifyInput`\ + \ | If `true`, an input tuple only containing only a single File (e.g. `[\"\ + foo\", file(\"in.h5ad\")]`) is automatically transformed to a map (i.e.\ + \ `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n| `simplifyOutput`\ + \ | If `true`, an output tuple containing a map with a File (e.g. `[\"foo\"\ + , [ output: file(\"out.h5ad\") ] ]`) is automatically transformed to a map\ + \ (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `false` |\n| `transcript` |\ + \ If `true`, the module's transcripts from `work/` are automatically published\ + \ to `params.transcriptDir`. If not defined, `params.publishDir + \"/_transcripts\"\ + ` will be used. Will throw an error if neither are defined. | `false` |\n\ + | `publish` | If `true`, the module's outputs are automatically published\ + \ to `params.publishDir`. If equal to \"state\", also a `.state.yaml` file\ + \ will be published in the publish dir. 
Will throw an error if `params.publishDir`\ + \ is not defined. | `false` |\n\n" + $ref: "#/definitions/NextflowAuto" + directives: + description: "Directives are optional settings that affect the execution of\ + \ the process. These mostly match up with the Nextflow counterparts. \n" + $ref: "#/definitions/NextflowDirectives" + container: + description: "Specifies the Docker engine id to be used to run Nextflow." + type: "string" + config: + description: "Allows tweaking how the Nextflow Config file is generated." + $ref: "#/definitions/NextflowConfig" + debug: + description: "Whether or not to print debug messages." + type: "boolean" + id: + description: "Name of the runner. As with all runners, you can give an runner\ + \ a different name. By specifying `id: foo`, you can target this runner\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + type: + description: "Run a Viash component on a Nextflow backend engine.\n" + const: "nextflow" + required: + - "type" + additionalProperties: false + Engine: + oneOf: + - $ref: "#/definitions/DockerEngine" + - $ref: "#/definitions/NativeEngine" + NativeEngine: + description: "Running a Viash component on a native engine means that the script\ + \ will be executed in your current environment.\nAny dependencies are assumed\ + \ to have been installed by the user, so the native engine is meant for developers\ + \ (who know what they're doing) or for simple bash scripts (which have no extra\ + \ dependencies).\n" + type: "object" + properties: + id: + description: "Name of the engine. As with all engines, you can give an engine\ + \ a different name. By specifying `id: foo`, you can target this engine\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + type: + description: "Running a Viash component on a native engine means that the\ + \ script will be executed in your current environment.\nAny dependencies\ + \ are assumed to have been installed by the user, so the native engine is\ + \ meant for developers (who know what they're doing) or for simple bash\ + \ scripts (which have no extra dependencies).\n" + const: "native" + required: + - "type" + additionalProperties: false + DockerEngine: + description: "Run a Viash component on a Docker backend engine.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a docker\ + \ container from scratch using the setup flag, or pull it from a docker repository.\n" + type: "object" + properties: + organization: + description: "Name of a start container's [organization](https://docs.docker.com/docker-hub/orgs/)." + type: "string" + registry: + description: "The URL to the a [custom Docker registry](https://docs.docker.com/registry/)\ + \ where the start container is located." + type: "string" + image: + description: "The base container to start from. You can also add the tag here\ + \ if you wish." + type: "string" + tag: + description: "Specify a Docker image based on its tag." + type: "string" + target_image: + description: "If anything is specified in the setup section, running the `---setup`\ + \ will result in an image with the name of `:`. If\ + \ nothing is specified in the `setup` section, simply `image` will be used.\ + \ Advanced usage only." + type: "string" + target_tag: + description: "The tag the resulting image gets. Advanced usage only." + type: "string" + namespace_separator: + description: "The separator between the namespace and the name of the component,\ + \ used for determining the image name. 
Default: \"/\"." + type: "string" + target_package: + description: "The package name set in the resulting image. Advanced usage\ + \ only." + type: "string" + id: + description: "Name of the engine. As with all engines, you can give a engine\ + \ a different name. By specifying `id: foo`, you can target this engine\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + target_registry: + description: "The URL where the resulting image will be pushed to. Advanced\ + \ usage only." + type: "string" + type: + description: "Run a Viash component on a Docker backend engine.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a\ + \ docker container from scratch using the setup flag, or pull it from a\ + \ docker repository.\n" + const: "docker" + target_organization: + description: "The organization set in the resulting image. Advanced usage\ + \ only." + type: "string" + setup: + description: "A list of requirements for installing the following types of\ + \ packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n\ + \ - Python\n - R\n - Ruby\n - yum\n\nThe order in which these dependencies\ + \ are specified determines the order in which they will be installed.\n" + type: "array" + items: + $ref: "#/definitions/Requirements" + cmd: + oneOf: + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "string" + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "array" + items: + type: "string" + target_image_source: + description: "The source of the target image. This is used for defining labels\ + \ in the dockerfile." + type: "string" + test_setup: + description: "Additional requirements specific for running unit tests." + type: "array" + items: + $ref: "#/definitions/Requirements" + entrypoint: + oneOf: + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "string" + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "array" + items: + type: "string" + required: + - "image" + - "type" + additionalProperties: false + Platform: + oneOf: + - $ref: "#/definitions/NativePlatform" + - $ref: "#/definitions/DockerPlatform" + - $ref: "#/definitions/NextflowPlatform" + NativePlatform: + description: "Running a Viash component on a native platform means that the script\ + \ will be executed in your current environment.\nAny dependencies are assumed\ + \ to have been installed by the user, so the native platform is meant for developers\ + \ (who know what they're doing) or for simple bash scripts (which have no extra\ + \ dependencies).\n" + type: "object" + properties: + id: + description: "As with all platforms, you can give a platform a different name.\ + \ By specifying `id: foo`, you can target this platform (only) by specifying\ + \ `-p foo` in any of the Viash commands." 
+ type: "string" + type: + description: "Running a Viash component on a native platform means that the\ + \ script will be executed in your current environment.\nAny dependencies\ + \ are assumed to have been installed by the user, so the native platform\ + \ is meant for developers (who know what they're doing) or for simple bash\ + \ scripts (which have no extra dependencies).\n" + const: "native" + required: + - "type" + additionalProperties: false + DockerPlatform: + description: "Run a Viash component on a Docker backend platform.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a docker\ + \ container from scratch using the setup flag, or pull it from a docker repository.\n" + type: "object" + properties: + organization: + description: "Name of a container's [organization](https://docs.docker.com/docker-hub/orgs/)." + type: "string" + registry: + description: "The URL to the a [custom Docker registry](https://docs.docker.com/registry/)" + type: "string" + image: + description: "The base container to start from. You can also add the tag here\ + \ if you wish." + type: "string" + tag: + description: "Specify a Docker image based on its tag." + type: "string" + target_tag: + description: "The tag the resulting image gets. Advanced usage only." + type: "string" + run_args: + oneOf: + - description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ + \ arguments." + type: "string" + - type: "array" + items: + description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ + \ arguments." + type: "string" + namespace_separator: + description: "The separator between the namespace and the name of the component,\ + \ used for determining the image name. Default: \"/\"." + type: "string" + resolve_volume: + description: "Enables or disables automatic volume mapping. Enabled when set\ + \ to `Automatic` or disabled when set to `Manual`. Default: `Automatic`." + $ref: "#/definitions/DockerResolveVolume" + cmd: + oneOf: + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "string" + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "array" + items: + type: "string" + id: + description: "As with all platforms, you can give a platform a different name.\ + \ By specifying `id: foo`, you can target this platform (only) by specifying\ + \ `-p foo` in any of the Viash commands." + type: "string" + port: + oneOf: + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "string" + - type: "array" + items: + description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "string" + target_registry: + description: "The URL where the resulting image will be pushed to. Advanced\ + \ usage only." + type: "string" + setup: + description: "A list of requirements for installing the following types of\ + \ packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n\ + \ - Python\n - R\n - Ruby\n - yum\n\nThe order in which these dependencies\ + \ are specified determines the order in which they will be installed.\n" + type: "array" + items: + $ref: "#/definitions/Requirements" + workdir: + description: "The working directory when starting the container. This doesn't\ + \ change the Dockerfile but gets added as a command-line argument at runtime." 
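+        # Illustrative sketch (hypothetical image and packages): a minimal docker
+        # platform entry using the fields in this definition
+        #   platforms:
+        #     - type: docker
+        #       image: python:3.10
+        #       setup:
+        #         - type: apt
+        #           packages: [procps]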
+ type: "string" + target_image: + description: "If anything is specified in the setup section, running the `---setup`\ + \ will result in an image with the name of `:`. If\ + \ nothing is specified in the `setup` section, simply `image` will be used.\ + \ Advanced usage only." + type: "string" + target_image_source: + description: "The source of the target image. This is used for defining labels\ + \ in the dockerfile." + type: "string" + test_setup: + description: "Additional requirements specific for running unit tests." + type: "array" + items: + $ref: "#/definitions/Requirements" + entrypoint: + oneOf: + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "string" + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "array" + items: + type: "string" + setup_strategy: + description: "The Docker setup strategy to use when building a container.\n\ + \n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild` / `build`\ + \ / `b` | Always build the image from the dockerfile. This is the default\ + \ setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb` | Always\ + \ build the image from the dockerfile, with caching enabled.\n| `ifneedbebuild`\ + \ | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ + \ | Build the image with caching enabled if it does not exist locally, with\ + \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ + \ from [Docker Hub](https://hub.docker.com) or the specified docker registry.\n\ + | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ + \ a registry and build it if it does not exist.\n| `alwayspullelsecachedbuild`\ + \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ + \ it with caching if it does not exist.\n| `ifneedbepull` | If the image\ + \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ + \ Do nothing if the image exists locally. Else, try to pull the image from\ + \ a registry. Otherwise build the image from scratch.\n| `ifneedbepullelsecachedbuild`\ + \ | Do nothing if the image exists locally. Else, try to pull the image\ + \ from a registry. Otherwise build the image with caching enabled.\n| `push`\ + \ | Push the container to [Docker Hub](https://hub.docker.com) or the specified\ + \ docker registry.\n| `pushifnotpresent` | Push the container to [Docker\ + \ Hub](https://hub.docker.com) or the specified docker registry if the tag\ + \ does not exist yet.\n| `donothing` / `meh` | Do not build or pull anything.\n\ + \n" + $ref: "#/definitions/DockerSetupStrategy" + type: + description: "Run a Viash component on a Docker backend platform.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a\ + \ docker container from scratch using the setup flag, or pull it from a\ + \ docker repository.\n" + const: "docker" + target_organization: + description: "The organization set in the resulting image. Advanced usage\ + \ only." + type: "string" + required: + - "image" + - "type" + additionalProperties: false + NextflowPlatform: + description: "Platform for generating Nextflow VDSL3 modules." + type: "object" + properties: + auto: + description: "Automated processing flags which can be toggled on or off:\n\ + \n| Flag | Description | Default |\n|---|---------|----|\n| `simplifyInput`\ + \ | If `true`, an input tuple only containing only a single File (e.g. 
`[\"\ + foo\", file(\"in.h5ad\")]`) is automatically transformed to a map (i.e.\ + \ `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n| `simplifyOutput`\ + \ | If `true`, an output tuple containing a map with a File (e.g. `[\"foo\"\ + , [ output: file(\"out.h5ad\") ] ]`) is automatically transformed to a map\ + \ (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `false` |\n| `transcript` |\ + \ If `true`, the module's transcripts from `work/` are automatically published\ + \ to `params.transcriptDir`. If not defined, `params.publishDir + \"/_transcripts\"\ + ` will be used. Will throw an error if neither are defined. | `false` |\n\ + | `publish` | If `true`, the module's outputs are automatically published\ + \ to `params.publishDir`. If equal to \"state\", also a `.state.yaml` file\ + \ will be published in the publish dir. Will throw an error if `params.publishDir`\ + \ is not defined. | `false` |\n\n" + $ref: "#/definitions/NextflowAuto" + directives: + description: "Directives are optional settings that affect the execution of\ + \ the process. These mostly match up with the Nextflow counterparts. \n" + $ref: "#/definitions/NextflowDirectives" + container: + description: "Specifies the Docker platform id to be used to run Nextflow." + type: "string" + config: + description: "Allows tweaking how the Nextflow Config file is generated." + $ref: "#/definitions/NextflowConfig" + debug: + description: "Whether or not to print debug messages." + type: "boolean" + id: + description: "Every platform can be given a specific id that can later be\ + \ referred to explicitly when running or building the Viash component." + type: "string" + type: + description: "Platform for generating Nextflow VDSL3 modules." + const: "nextflow" + required: + - "type" + additionalProperties: false + Requirements: + oneOf: + - $ref: "#/definitions/ApkRequirements" + - $ref: "#/definitions/AptRequirements" + - $ref: "#/definitions/DockerRequirements" + - $ref: "#/definitions/JavaScriptRequirements" + - $ref: "#/definitions/PythonRequirements" + - $ref: "#/definitions/RRequirements" + - $ref: "#/definitions/RubyRequirements" + - $ref: "#/definitions/YumRequirements" + ApkRequirements: + description: "Specify which apk packages should be available in order to run the\ + \ component." + type: "object" + properties: + type: + description: "Specify which apk packages should be available in order to run\ + \ the component." + const: "apk" + packages: + oneOf: + - description: "Specifies which packages to install." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install." + type: "string" + required: + - "type" + additionalProperties: false + AptRequirements: + description: "Specify which apt packages should be available in order to run the\ + \ component." + type: "object" + properties: + interactive: + description: "If `false`, the Debian frontend is set to non-interactive (recommended).\ + \ Default: false." + type: "boolean" + type: + description: "Specify which apt packages should be available in order to run\ + \ the component." + const: "apt" + packages: + oneOf: + - description: "Specifies which packages to install." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install." + type: "string" + required: + - "type" + additionalProperties: false + DockerRequirements: + description: "Specify which Docker commands should be run during setup." 
+ type: "object" + properties: + run: + oneOf: + - description: "Specifies which `RUN` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - type: "array" + items: + description: "Specifies which `RUN` entries to add to the Dockerfile while\ + \ building it." + type: "string" + label: + oneOf: + - description: "Specifies which `LABEL` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - type: "array" + items: + description: "Specifies which `LABEL` entries to add to the Dockerfile\ + \ while building it." + type: "string" + build_args: + oneOf: + - description: "Specifies which `ARG` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - type: "array" + items: + description: "Specifies which `ARG` entries to add to the Dockerfile while\ + \ building it." + type: "string" + copy: + oneOf: + - description: "Specifies which `COPY` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - type: "array" + items: + description: "Specifies which `COPY` entries to add to the Dockerfile\ + \ while building it." + type: "string" + type: + description: "Specify which Docker commands should be run during setup." + const: "docker" + add: + oneOf: + - description: "Specifies which `ADD` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - type: "array" + items: + description: "Specifies which `ADD` entries to add to the Dockerfile while\ + \ building it." + type: "string" + env: + oneOf: + - description: "Specifies which `ENV` entries to add to the Dockerfile while\ + \ building it. Unlike `ARG`, `ENV` entries are also accessible from inside\ + \ the container." + type: "string" + - type: "array" + items: + description: "Specifies which `ENV` entries to add to the Dockerfile while\ + \ building it. Unlike `ARG`, `ENV` entries are also accessible from\ + \ inside the container." + type: "string" + required: + - "type" + additionalProperties: false + JavaScriptRequirements: + description: "Specify which JavaScript packages should be available in order to\ + \ run the component." + type: "object" + properties: + github: + oneOf: + - description: "Specifies which packages to install from GitHub." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from GitHub." + type: "string" + url: + oneOf: + - description: "Specifies which packages to install using a generic URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a generic URI." + type: "string" + git: + oneOf: + - description: "Specifies which packages to install using a Git URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a Git URI." + type: "string" + npm: + oneOf: + - description: "Specifies which packages to install from npm." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from npm." + type: "string" + type: + description: "Specify which JavaScript packages should be available in order\ + \ to run the component." + const: "javascript" + packages: + oneOf: + - description: "Specifies which packages to install from npm." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from npm." + type: "string" + required: + - "type" + additionalProperties: false + PythonRequirements: + description: "Specify which Python packages should be available in order to run\ + \ the component." 
+ type: "object" + properties: + github: + oneOf: + - description: "Specifies which packages to install from GitHub." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from GitHub." + type: "string" + gitlab: + oneOf: + - description: "Specifies which packages to install from GitLab." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from GitLab." + type: "string" + pip: + oneOf: + - description: "Specifies which packages to install from pip." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from pip." + type: "string" + pypi: + oneOf: + - description: "Specifies which packages to install from PyPI using pip." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from PyPI using pip." + type: "string" + git: + oneOf: + - description: "Specifies which packages to install using a Git URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a Git URI." + type: "string" + upgrade: + description: "Sets the `--upgrade` flag when set to true. Default: true." + type: "boolean" + packages: + oneOf: + - description: "Specifies which packages to install from pip." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from pip." + type: "string" + url: + oneOf: + - description: "Specifies which packages to install using a generic URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a generic URI." + type: "string" + svn: + oneOf: + - description: "Specifies which packages to install using an SVN URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using an SVN URI." + type: "string" + bazaar: + oneOf: + - description: "Specifies which packages to install using a Bazaar URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a Bazaar URI." + type: "string" + script: + oneOf: + - description: "Specifies a code block to run as part of the build." + type: "string" + - type: "array" + items: + description: "Specifies a code block to run as part of the build." + type: "string" + type: + description: "Specify which Python packages should be available in order to\ + \ run the component." + const: "python" + mercurial: + oneOf: + - description: "Specifies which packages to install using a Mercurial URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a Mercurial URI." + type: "string" + user: + description: "Sets the `--user` flag when set to true. Default: false." + type: "boolean" + required: + - "type" + additionalProperties: false + RRequirements: + description: "Specify which R packages should be available in order to run the\ + \ component." + type: "object" + properties: + bioc: + oneOf: + - description: "Specifies which packages to install from BioConductor." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from BioConductor." + type: "string" + github: + oneOf: + - description: "Specifies which packages to install from GitHub." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from GitHub." + type: "string" + gitlab: + oneOf: + - description: "Specifies which packages to install from GitLab." 
+ type: "string" + - type: "array" + items: + description: "Specifies which packages to install from GitLab." + type: "string" + url: + oneOf: + - description: "Specifies which packages to install using a generic URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a generic URI." + type: "string" + bioc_force_install: + description: "Forces packages specified in `bioc` to be reinstalled, even\ + \ if they are already present in the container. Default: false." + type: "boolean" + git: + oneOf: + - description: "Specifies which packages to install using a Git URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using a Git URI." + type: "string" + cran: + oneOf: + - description: "Specifies which packages to install from CRAN." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from CRAN." + type: "string" + bitbucket: + oneOf: + - description: "Specifies which packages to install from Bitbucket." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from Bitbucket." + type: "string" + svn: + oneOf: + - description: "Specifies which packages to install using an SVN URI." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install using an SVN URI." + type: "string" + packages: + oneOf: + - description: "Specifies which packages to install from CRAN." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install from CRAN." + type: "string" + script: + oneOf: + - description: "Specifies a code block to run as part of the build." + type: "string" + - type: "array" + items: + description: "Specifies a code block to run as part of the build." + type: "string" + type: + description: "Specify which R packages should be available in order to run\ + \ the component." + const: "r" + required: + - "type" + additionalProperties: false + RubyRequirements: + description: "Specify which Ruby packages should be available in order to run\ + \ the component." + type: "object" + properties: + type: + description: "Specify which Ruby packages should be available in order to\ + \ run the component." + const: "ruby" + packages: + oneOf: + - description: "Specifies which packages to install." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install." + type: "string" + required: + - "type" + additionalProperties: false + YumRequirements: + description: "Specify which yum packages should be available in order to run the\ + \ component." + type: "object" + properties: + type: + description: "Specify which yum packages should be available in order to run\ + \ the component." + const: "yum" + packages: + oneOf: + - description: "Specifies which packages to install." + type: "string" + - type: "array" + items: + description: "Specifies which packages to install." + type: "string" + required: + - "type" + additionalProperties: false + Argument: + oneOf: + - $ref: "#/definitions/BooleanArgument" + - $ref: "#/definitions/BooleanTrueArgument" + - $ref: "#/definitions/BooleanFalseArgument" + - $ref: "#/definitions/DoubleArgument" + - $ref: "#/definitions/FileArgument" + - $ref: "#/definitions/IntegerArgument" + - $ref: "#/definitions/LongArgument" + - $ref: "#/definitions/StringArgument" + BooleanArgument: + description: "A `boolean` type argument has two possible values: `true` or `false`." 
+ type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." + type: "string" + name: + description: "The name of the argument. Can be in the formats `--trim`, `-t`\ + \ or `trim`. The number of dashes determines how values can be passed: \ + \ \n\n - `--trim` is a long option, which can be passed with `executable_name\ + \ --trim`\n - `-t` is a short option, which can be passed with `executable_name\ + \ -t`\n - `trim` is an argument, which can be passed with `executable_name\ + \ trim` \n" + type: "string" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + direction: + $ref: "#/definitions/Direction" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + default: + oneOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "boolean" + - type: "array" + items: + description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "boolean" + example: + oneOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "boolean" + - type: "array" + items: + description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "boolean" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `boolean` type argument has two possible values: `true` or\ + \ `false`." + const: "boolean" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + BooleanTrueArgument: + description: "An argument of the `boolean_true` type acts like a `boolean` flag\ + \ with a default value of `false`. When called as an argument it sets the `boolean`\ + \ to `true`." + type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." + type: "string" + name: + description: "The name of the argument. Can be in the formats `--silent`,\ + \ `-s` or `silent`. 
The number of dashes determines how values can be passed:\ + \ \n\n - `--silent` is a long option, which can be passed with `executable_name\ + \ --silent`\n - `-s` is a short option, which can be passed with `executable_name\ + \ -s`\n - `silent` is an argument, which can be passed with `executable_name\ + \ silent` \n" + type: "string" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + direction: + $ref: "#/definitions/Direction" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + type: + description: "An argument of the `boolean_true` type acts like a `boolean`\ + \ flag with a default value of `false`. When called as an argument it sets\ + \ the `boolean` to `true`." + const: "boolean_true" + required: + - "name" + - "type" + additionalProperties: false + BooleanFalseArgument: + description: "An argument of the `boolean_false` type acts like an inverted `boolean`\ + \ flag with a default value of `true`. When called as an argument it sets the\ + \ `boolean` to `false`." + type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." + type: "string" + name: + description: "The name of the argument. Can be in the formats `--no-log`,\ + \ `-n` or `no-log`. The number of dashes determines how values can be passed:\ + \ \n\n - `--no-log` is a long option, which can be passed with `executable_name\ + \ --no-log`\n - `-n` is a short option, which can be passed with `executable_name\ + \ -n`\n - `no-log` is an argument, which can be passed with `executable_name\ + \ no-log` \n" + type: "string" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + direction: + $ref: "#/definitions/Direction" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + type: + description: "An argument of the `boolean_false` type acts like an inverted\ + \ `boolean` flag with a default value of `true`. When called as an argument\ + \ it sets the `boolean` to `false`." + const: "boolean_false" + required: + - "name" + - "type" + additionalProperties: false + DoubleArgument: + description: "A `double` type argument has a numeric value with decimal points" + type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. 
The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + max: + description: "Maximum allowed value for this argument. If set and the provided\ + \ value is higher than the maximum, an error will be produced. Can be combined\ + \ with [`min`](#min) to clamp values." + $ref: "#/definitions/DoubleWithInf" + default: + oneOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + $ref: "#/definitions/DoubleWithInf" + - type: "array" + items: + description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + $ref: "#/definitions/DoubleWithInf" + example: + oneOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + $ref: "#/definitions/DoubleWithInf" + - type: "array" + items: + description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + $ref: "#/definitions/DoubleWithInf" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + min: + description: "Minimum allowed value for this argument. If set and the provided\ + \ value is lower than the minimum, an error will be produced. Can be combined\ + \ with [`max`](#max) to clamp values." + $ref: "#/definitions/DoubleWithInf" + direction: + $ref: "#/definitions/Direction" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `double` type argument has a numeric value with decimal points" + const: "double" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + FileArgument: + description: "A `file` type argument has a string value that points to a file\ + \ or folder path." + type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." 
+ type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + create_parent: + description: "If the output filename is a path and it does not exist, create\ + \ it before executing the script (only for `direction: output`)." + type: "boolean" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + direction: + description: "Makes this argument an `input` or an `output`, as in does the\ + \ file/folder needs to be read or written. `input` by default." + $ref: "#/definitions/Direction" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + must_exist: + description: "Checks whether the file or folder exists. For input files, this\ + \ check will happen before the execution of the script, while for output\ + \ files the check will happen afterwards." + type: "boolean" + default: + oneOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "string" + - type: "array" + items: + description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "string" + example: + oneOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "string" + - type: "array" + items: + description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "string" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + multiple: + description: "Allow for multiple values (`false` by default).\n\nFor input\ + \ arguments, this will be treated as a list of values. For example, values\n\ + can be passed using the delimiter `--foo=1:2:3` or by providing the same\ + \ argument \nmultiple times `--foo 1 --foo 2`. You can use a custom delimiter\ + \ by using the \n[`multiple_sep`](#multiple_sep) property.\n\nFor output\ + \ file arguments, the passed value needs to contain a wildcard. For example,\n\ + `--foo 'foo_*.txt'` will be treated as a list of files that match the pattern.\ + \ Note that in Bash,\n the wildcard will need to be in quotes (\"foo_*.txt\"\ + \ or `'foo_*.txt'`) or else Bash will\n automatically attempt to expand\ + \ the expression.\n\nOther output arguments (e.g. integer, double, ...)\ + \ are not supported yet.\n" + type: "boolean" + type: + description: "A `file` type argument has a string value that points to a file\ + \ or folder path." 
+ const: "file" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + IntegerArgument: + description: "An `integer` type argument has a numeric value without decimal points." + type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + choices: + description: "Limit the amount of valid values for this argument to those\ + \ set in this list. When set and a value not present in the list is provided,\ + \ an error will be produced." + type: "array" + items: + type: "integer" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + max: + description: "Maximum allowed value for this argument. If set and the provided\ + \ value is higher than the maximum, an error will be produced. Can be combined\ + \ with [`min`](#min) to clamp values." + type: "integer" + default: + oneOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "integer" + - type: "array" + items: + description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "integer" + example: + oneOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "integer" + - type: "array" + items: + description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "integer" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + min: + description: "Minimum allowed value for this argument. If set and the provided\ + \ value is lower than the minimum, an error will be produced. Can be combined\ + \ with [`max`](#max) to clamp values." + type: "integer" + direction: + $ref: "#/definitions/Direction" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. 
You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "An `integer` type argument has a numeric value without decimal\ + \ points." + const: "integer" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + LongArgument: + description: "An `long` type argument has a numeric value without decimal points." + type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + choices: + description: "Limit the amount of valid values for this argument to those\ + \ set in this list. When set and a value not present in the list is provided,\ + \ an error will be produced." + type: "array" + items: + type: "integer" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + max: + description: "Maximum allowed value for this argument. If set and the provided\ + \ value is higher than the maximum, an error will be produced. Can be combined\ + \ with [`min`](#min) to clamp values." + type: "integer" + default: + oneOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "integer" + - type: "array" + items: + description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "integer" + example: + oneOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "integer" + - type: "array" + items: + description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "integer" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + min: + description: "Minimum allowed value for this argument. If set and the provided\ + \ value is lower than the minimum, an error will be produced. Can be combined\ + \ with [`max`](#max) to clamp values." 
+ type: "integer" + direction: + $ref: "#/definitions/Direction" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "An `long` type argument has a numeric value without decimal\ + \ points." + const: "long" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + StringArgument: + description: "A `string` type argument has a value made up of an ordered sequences\ + \ of characters, like \"Hello\" or \"I'm a string\"." + type: "object" + properties: + alternatives: + oneOf: + - description: "List of alternative format variations for this argument." + type: "string" + - type: "array" + items: + description: "List of alternative format variations for this argument." + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + choices: + description: "Limit the amount of valid values for this argument to those\ + \ set in this list. When set and a value not present in the list is provided,\ + \ an error will be produced." + type: "array" + items: + type: "string" + label: + description: "A clean version of the argument's name. This is only used for\ + \ documentation." + type: "string" + direction: + $ref: "#/definitions/Direction" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + default: + oneOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "string" + - type: "array" + items: + description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "string" + example: + oneOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "string" + - type: "array" + items: + description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "string" + summary: + description: "A one-sentence summary of the argument. This is only used for\ + \ documentation." + type: "string" + description: + description: "A description of the argument. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + multiple: + description: "Treat the argument value as an array. 
Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `string` type argument has a value made up of an ordered sequences\ + \ of characters, like \"Hello\" or \"I'm a string\"." + const: "string" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + Resource: + oneOf: + - $ref: "#/definitions/BashScript" + - $ref: "#/definitions/CSharpScript" + - $ref: "#/definitions/Executable" + - $ref: "#/definitions/JavaScriptScript" + - $ref: "#/definitions/NextflowScript" + - $ref: "#/definitions/PlainFile" + - $ref: "#/definitions/PythonScript" + - $ref: "#/definitions/RScript" + - $ref: "#/definitions/ScalaScript" + BashScript: + description: "An executable Bash script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable Bash script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." + const: "bash_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + CSharpScript: + description: "An executable C# script.\nWhen defined in resources, only the first\ + \ entry will be executed when running the built component or when running `viash\ + \ run`.\nWhen defined in test_resources, all entries will be executed during\ + \ `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable C# script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." + const: "csharp_script" + dest: + description: "Resulting filename of the resource. 
From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + Executable: + description: "An executable file." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable file." + const: "executable" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + JavaScriptScript: + description: "An executable JavaScript script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will be\ + \ executed during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable JavaScript script.\nWhen defined in resources,\ + \ only the first entry will be executed when running the built component\ + \ or when running `viash run`.\nWhen defined in test_resources, all entries\ + \ will be executed during `viash test`." + const: "javascript_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + NextflowScript: + description: "A Nextflow script. Work in progress; added mainly for annotation\ + \ at the moment." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + entrypoint: + description: "The name of the workflow to be wrapped." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "A Nextflow script. Work in progress; added mainly for annotation\ + \ at the moment." + const: "nextflow_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." 
+ type: "string" + required: + - "entrypoint" + - "type" + additionalProperties: false + PlainFile: + description: "A plain file. This can only be used as a supporting resource for\ + \ the main script or unit tests." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "A plain file. This can only be used as a supporting resource\ + \ for the main script or unit tests." + const: "file" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: [] + additionalProperties: false + PythonScript: + description: "An executable Python script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable Python script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." + const: "python_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + RScript: + description: "An executable R script.\nWhen defined in resources, only the first\ + \ entry will be executed when running the built component or when running `viash\ + \ run`.\nWhen defined in test_resources, all entries will be executed during\ + \ `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable R script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." + const: "r_script" + dest: + description: "Resulting filename of the resource. 
From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + ScalaScript: + description: "An executable Scala script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable Scala script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." + const: "scala_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + NextflowDirectives: + description: "Directives are optional settings that affect the execution of the\ + \ process.\n" + type: "object" + properties: + beforeScript: + description: "The `beforeScript` directive allows you to execute a custom\ + \ (Bash) snippet before the main process script is run. 
This may be useful\ + \ to initialise the underlying cluster environment or for other custom initialisation.\n\ + \nSee [`beforeScript`](https://www.nextflow.io/docs/latest/process.html#beforeScript).\n" + type: "string" + module: + oneOf: + - description: "Environment Modules is a package manager that allows you to\ + \ dynamically configure your execution environment and easily switch between\ + \ multiple versions of the same software tool.\n\nIf it is available in\ + \ your system you can use it with Nextflow in order to configure the processes\ + \ execution environment in your pipeline.\n\nIn a process definition you\ + \ can use the `module` directive to load a specific module version to\ + \ be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n" + type: "string" + - type: "array" + items: + description: "Environment Modules is a package manager that allows you\ + \ to dynamically configure your execution environment and easily switch\ + \ between multiple versions of the same software tool.\n\nIf it is available\ + \ in your system you can use it with Nextflow in order to configure\ + \ the processes execution environment in your pipeline.\n\nIn a process\ + \ definition you can use the `module` directive to load a specific module\ + \ version to be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n" + type: "string" + queue: + oneOf: + - description: "The `queue` directory allows you to set the queue where jobs\ + \ are scheduled when using a grid based executor in your pipeline.\n\n\ + See [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n" + type: "string" + - type: "array" + items: + description: "The `queue` directory allows you to set the queue where\ + \ jobs are scheduled when using a grid based executor in your pipeline.\n\ + \nSee [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n" + type: "string" + label: + oneOf: + - description: "The `label` directive allows the annotation of processes with\ + \ mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n" + type: "string" + - type: "array" + items: + description: "The `label` directive allows the annotation of processes\ + \ with mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n" + type: "string" + container: + oneOf: + - description: "The `container` directive allows you to execute the process\ + \ script in a Docker container.\n\nIt requires the Docker daemon to be\ + \ running in machine where the pipeline is executed, i.e. the local machine\ + \ when using the local executor or the cluster nodes when the pipeline\ + \ is deployed through a grid executor.\n\nViash implements allows either\ + \ a string value or a map. In case a map is used, the allowed keys are:\ + \ `registry`, `image`, and `tag`. The `image` value must be specified.\n\ + \nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n" + type: "object" + additionalProperties: + type: "string" + - description: "The `container` directive allows you to execute the process\ + \ script in a Docker container.\n\nIt requires the Docker daemon to be\ + \ running in machine where the pipeline is executed, i.e. 
the local machine\ + \ when using the local executor or the cluster nodes when the pipeline\ + \ is deployed through a grid executor.\n\nViash implements allows either\ + \ a string value or a map. In case a map is used, the allowed keys are:\ + \ `registry`, `image`, and `tag`. The `image` value must be specified.\n\ + \nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n" + type: "string" + publishDir: + oneOf: + - oneOf: + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path`\ + \ key and value are required.\nThe allowed values for `mode` are: `symlink`,\ + \ `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "string" + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path`\ + \ key and value are required.\nThe allowed values for `mode` are: `symlink`,\ + \ `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "object" + additionalProperties: + type: "string" + - type: "array" + items: + oneOf: + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The\ + \ `path` key and value are required.\nThe allowed values for `mode`\ + \ are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\ + \nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "string" + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The\ + \ `path` key and value are required.\nThe allowed values for `mode`\ + \ are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\ + \nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "object" + additionalProperties: + type: "string" + maxForks: + oneOf: + - description: "The `maxForks` directive allows you to define the maximum\ + \ number of process instances that can be executed in parallel. By default\ + \ this value is equals to the number of CPU cores available minus 1.\n\ + \nIf you want to execute a process in a sequential manner, set this directive\ + \ to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n" + type: "string" + - description: "The `maxForks` directive allows you to define the maximum\ + \ number of process instances that can be executed in parallel. 
By default\ + \ this value is equals to the number of CPU cores available minus 1.\n\ + \nIf you want to execute a process in a sequential manner, set this directive\ + \ to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n" + type: "integer" + maxErrors: + oneOf: + - description: "The `maxErrors` directive allows you to specify the maximum\ + \ number of times a process can fail when using the `retry` error strategy.\ + \ By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n" + type: "string" + - description: "The `maxErrors` directive allows you to specify the maximum\ + \ number of times a process can fail when using the `retry` error strategy.\ + \ By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n" + type: "integer" + cpus: + oneOf: + - description: "The `cpus` directive allows you to define the number of (logical)\ + \ CPU required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n" + type: "integer" + - description: "The `cpus` directive allows you to define the number of (logical)\ + \ CPU required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n" + type: "string" + accelerator: + description: "The `accelerator` directive allows you to specify the hardware\ + \ accelerator requirement for the task execution e.g. GPU processor.\n\n\ + Viash implements this directive as a map with accepted keywords: `type`,\ + \ `limit`, `request`, and `runtime`.\n\nSee [`accelerator`](https://www.nextflow.io/docs/latest/process.html#accelerator).\n" + type: "object" + additionalProperties: + type: "string" + time: + description: "The `time` directive allows you to define how long a process\ + \ is allowed to run.\n\nSee [`time`](https://www.nextflow.io/docs/latest/process.html#time).\n" + type: "string" + afterScript: + description: "The `afterScript` directive allows you to execute a custom (Bash)\ + \ snippet immediately after the main process has run. This may be useful\ + \ to clean up your staging area.\n\nSee [`afterScript`](https://www.nextflow.io/docs/latest/process.html#afterscript).\n" + type: "string" + executor: + description: "The `executor` defines the underlying system where processes\ + \ are executed. By default a process uses the executor defined globally\ + \ in the nextflow.config file.\n\nThe `executor` directive allows you to\ + \ configure what executor has to be used by the process, overriding the\ + \ default configuration. The following values can be used:\n\n| Name | Executor\ + \ |\n|------|----------|\n| awsbatch | The process is executed using the\ + \ AWS Batch service. | \n| azurebatch | The process is executed using the\ + \ Azure Batch service. | \n| condor | The process is executed using the\ + \ HTCondor job scheduler. | \n| google-lifesciences | The process is executed\ + \ using the Google Genomics Pipelines service. | \n| ignite | The process\ + \ is executed using the Apache Ignite cluster. | \n| k8s | The process is\ + \ executed using the Kubernetes cluster. | \n| local | The process is executed\ + \ in the computer where Nextflow is launched. | \n| lsf | The process is\ + \ executed using the Platform LSF job scheduler. | \n| moab | The process\ + \ is executed using the Moab job scheduler. | \n| nqsii | The process is\ + \ executed using the NQSII job scheduler. 
| \n| oge | Alias for the sge\ + \ executor. | \n| pbs | The process is executed using the PBS/Torque job\ + \ scheduler. | \n| pbspro | The process is executed using the PBS Pro job\ + \ scheduler. | \n| sge | The process is executed using the Sun Grid Engine\ + \ / Open Grid Engine. | \n| slurm | The process is executed using the SLURM\ + \ job scheduler. | \n| tes | The process is executed using the GA4GH TES\ + \ service. | \n| uge | Alias for the sge executor. |\n\nSee [`executor`](https://www.nextflow.io/docs/latest/process.html#executor).\n" + type: "string" + containerOptions: + oneOf: + - description: "The `containerOptions` directive allows you to specify any\ + \ container execution option supported by the underlying container engine\ + \ (ie. Docker, Singularity, etc). This can be useful to provide container\ + \ settings only for a specific process e.g. mount a custom path.\n\nSee\ + \ [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n" + type: "string" + - type: "array" + items: + description: "The `containerOptions` directive allows you to specify any\ + \ container execution option supported by the underlying container engine\ + \ (ie. Docker, Singularity, etc). This can be useful to provide container\ + \ settings only for a specific process e.g. mount a custom path.\n\n\ + See [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n" + type: "string" + disk: + description: "The `disk` directive allows you to define how much local disk\ + \ storage the process is allowed to use.\n\nSee [`disk`](https://www.nextflow.io/docs/latest/process.html#disk).\n" + type: "string" + tag: + description: "The `tag` directive allows you to associate each process execution\ + \ with a custom label, so that it will be easier to identify them in the\ + \ log file or in the trace execution report.\n\nFor ease of use, the default\ + \ tag is set to \"$id\", which allows tracking the progression of the channel\ + \ events through the workflow more easily.\n\nSee [`tag`](https://www.nextflow.io/docs/latest/process.html#tag).\n" + type: "string" + conda: + oneOf: + - description: "The `conda` directive allows for the definition of the process\ + \ dependencies using the Conda package manager.\n\nNextflow automatically\ + \ sets up an environment for the given package names listed by in the\ + \ `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n" + type: "string" + - type: "array" + items: + description: "The `conda` directive allows for the definition of the process\ + \ dependencies using the Conda package manager.\n\nNextflow automatically\ + \ sets up an environment for the given package names listed by in the\ + \ `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n" + type: "string" + machineType: + description: " The `machineType` can be used to specify a predefined Google\ + \ Compute Platform machine type when running using the Google Life Sciences\ + \ executor.\n\nSee [`machineType`](https://www.nextflow.io/docs/latest/process.html#machinetype).\n" + type: "string" + stageInMode: + description: "The `stageInMode` directive defines how input files are staged-in\ + \ to the process work directory. The following values are allowed:\n\n|\ + \ Value | Description |\n|-------|-------------| \n| copy | Input files\ + \ are staged in the process work directory by creating a copy. 
| \n| link\ + \ | Input files are staged in the process work directory by creating an\ + \ (hard) link for each of them. | \n| symlink | Input files are staged in\ + \ the process work directory by creating a symbolic link with an absolute\ + \ path for each of them (default). | \n| rellink | Input files are staged\ + \ in the process work directory by creating a symbolic link with a relative\ + \ path for each of them. | \n\nSee [`stageInMode`](https://www.nextflow.io/docs/latest/process.html#stageinmode).\n" + type: "string" + cache: + oneOf: + - description: "The `cache` directive allows you to store the process results\ + \ to a local cache. When the cache is enabled and the pipeline is launched\ + \ with the resume option, any following attempt to execute the process,\ + \ along with the same inputs, will cause the process execution to be skipped,\ + \ producing the stored data as the actual results.\n\nThe caching feature\ + \ generates a unique key by indexing the process script and inputs. This\ + \ key is used to identify univocally the outputs produced by the process\ + \ execution.\n\nThe `cache` is enabled by default, you can disable it\ + \ for a specific process by setting the cache directive to `false`.\n\n\ + Accepted values are: `true`, `false`, \"deep\", and \"lenient\".\n\nSee\ + \ [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" + type: "boolean" + - description: "The `cache` directive allows you to store the process results\ + \ to a local cache. When the cache is enabled and the pipeline is launched\ + \ with the resume option, any following attempt to execute the process,\ + \ along with the same inputs, will cause the process execution to be skipped,\ + \ producing the stored data as the actual results.\n\nThe caching feature\ + \ generates a unique key by indexing the process script and inputs. 
This\ + \ key is used to identify univocally the outputs produced by the process\ + \ execution.\n\nThe `cache` is enabled by default, you can disable it\ + \ for a specific process by setting the cache directive to `false`.\n\n\ + Accepted values are: `true`, `false`, \"deep\", and \"lenient\".\n\nSee\ + \ [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" + type: "string" + pod: + oneOf: + - description: "The `pod` directive allows the definition of pods specific\ + \ settings, such as environment variables, secrets and config maps when\ + \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" + type: "object" + additionalProperties: + type: "string" + - type: "array" + items: + description: "The `pod` directive allows the definition of pods specific\ + \ settings, such as environment variables, secrets and config maps when\ + \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" + type: "object" + additionalProperties: + type: "string" + penv: + description: "The `penv` directive allows you to define the parallel environment\ + \ to be used when submitting a parallel task to the SGE resource manager.\n\ + \nSee [`penv`](https://www.nextflow.io/docs/latest/process.html#penv).\n" + type: "string" + scratch: + oneOf: + - description: "The `scratch` directive allows you to execute the process\ + \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" + type: "boolean" + - description: "The `scratch` directive allows you to execute the process\ + \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" + type: "string" + storeDir: + description: "The `storeDir` directive allows you to define a directory that\ + \ is used as a permanent cache for your process results.\n\nSee [`storeDir`](https://www.nextflow.io/docs/latest/process.html#storeDir).\n" + type: "string" + maxRetries: + oneOf: + - description: "The `maxRetries` directive allows you to define the maximum\ + \ number of times a process instance can be re-submitted in case of failure.\ + \ This value is applied only when using the retry error strategy. By default\ + \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" + type: "string" + - description: "The `maxRetries` directive allows you to define the maximum\ + \ number of times a process instance can be re-submitted in case of failure.\ + \ This value is applied only when using the retry error strategy. By default\ + \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" + type: "integer" + echo: + oneOf: + - description: "By default the stdout produced by the commands executed in\ + \ all processes is ignored. By setting the `echo` directive to true, you\ + \ can forward the process stdout to the current top running process stdout\ + \ file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n" + type: "boolean" + - description: "By default the stdout produced by the commands executed in\ + \ all processes is ignored. 
By setting the `echo` directive to true, you\ + \ can forward the process stdout to the current top running process stdout\ + \ file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n" + type: "string" + errorStrategy: + description: "The `errorStrategy` directive allows you to define how an error\ + \ condition is managed by the process. By default when an error status is\ + \ returned by the executed script, the process stops immediately. This in\ + \ turn forces the entire pipeline to terminate.\n\nTable of available error\ + \ strategies:\n| Name | Executor |\n|------|----------|\n| `terminate` |\ + \ Terminates the execution as soon as an error condition is reported. Pending\ + \ jobs are killed (default) |\n| `finish` | Initiates an orderly pipeline\ + \ shutdown when an error condition is raised, waiting the completion of\ + \ any submitted job. |\n| `ignore` | Ignores processes execution errors.\ + \ |\n| `retry` | Re-submit for execution a process returning an error condition.\ + \ |\n\nSee [`errorStrategy`](https://www.nextflow.io/docs/latest/process.html#errorstrategy).\n" + type: "string" + memory: + description: "The `memory` directive allows you to define how much memory\ + \ the process is allowed to use.\n\nSee [`memory`](https://www.nextflow.io/docs/latest/process.html#memory).\n" + type: "string" + stageOutMode: + description: "The `stageOutMode` directive defines how output files are staged-out\ + \ from the scratch directory to the process work directory. The following\ + \ values are allowed:\n\n| Value | Description |\n|-------|-------------|\ + \ \n| copy | Output files are copied from the scratch directory to the work\ + \ directory. | \n| move | Output files are moved from the scratch directory\ + \ to the work directory. | \n| rsync | Output files are copied from the\ + \ scratch directory to the work directory by using the rsync utility. |\n\ + \nSee [`stageOutMode`](https://www.nextflow.io/docs/latest/process.html#stageoutmode).\n" + type: "string" + required: [] + additionalProperties: false + NextflowAuto: + description: "Automated processing flags which can be toggled on or off." + type: "object" + properties: + simplifyInput: + description: "If `true`, an input tuple only containing only a single File\ + \ (e.g. `[\"foo\", file(\"in.h5ad\")]`) is automatically transformed to\ + \ a map (i.e. `[\"foo\", [ input: file(\"in.h5ad\") ] ]`).\n\nDefault: `true`.\n" + type: "boolean" + simplifyOutput: + description: "If `true`, an output tuple containing a map with a File (e.g.\ + \ `[\"foo\", [ output: file(\"out.h5ad\") ] ]`) is automatically transformed\ + \ to a map (i.e. 
`[\"foo\", file(\"out.h5ad\")]`).\n\nDefault: `false`.\n" + type: "boolean" + publish: + oneOf: + - description: "If `true`, the module's outputs are automatically published\ + \ to `params.publishDir`.\nIf equal to \"state\", also a `.state.yaml`\ + \ file will be published in the publish dir.\nWill throw an error if `params.publishDir`\ + \ is not defined.\n\nDefault: `false`.\n" + type: "boolean" + - description: "If `true`, the module's outputs are automatically published\ + \ to `params.publishDir`.\nIf equal to \"state\", also a `.state.yaml`\ + \ file will be published in the publish dir.\nWill throw an error if `params.publishDir`\ + \ is not defined.\n\nDefault: `false`.\n" + type: "string" + transcript: + description: "If `true`, the module's transcripts from `work/` are automatically\ + \ published to `params.transcriptDir`.\nIf not defined, `params.publishDir\ + \ + \"/_transcripts\"` will be used.\nWill throw an error if neither are\ + \ defined.\n\nDefault: `false`.\n" + type: "boolean" + required: [] + additionalProperties: false + NextflowConfig: + description: "Allows tweaking how the Nextflow Config file is generated." + type: "object" + properties: + labels: + description: "A series of default labels to specify memory and cpu constraints.\n\ + \nThe default memory labels are defined as \"mem1gb\", \"mem2gb\", \"mem4gb\"\ + , ... upto \"mem512tb\" and follows powers of 2.\nThe default cpu labels\ + \ are defined as \"cpu1\", \"cpu2\", \"cpu5\", \"cpu10\", ... upto \"cpu1000\"\ + \ and follows a semi logarithmic scale (1, 2, 5 per decade).\n\nConceptually\ + \ it is possible for a Viash Config to overwrite the full labels parameter,\ + \ however likely it is more efficient to add additional labels\nin the Viash\ + \ Package with a config mod.\n" + type: "object" + additionalProperties: + type: "string" + script: + oneOf: + - description: "Includes a single string or list of strings into the nextflow.config\ + \ file.\nThis can be used to add custom profiles or include an additional\ + \ config file.\n" + type: "string" + - type: "array" + items: + description: "Includes a single string or list of strings into the nextflow.config\ + \ file.\nThis can be used to add custom profiles or include an additional\ + \ config file.\n" + type: "string" + required: [] + additionalProperties: false + Dependency: + description: "Specifies a Viash component (script or executable) that should be\ + \ made available for the code defined in the component.\nThe dependency components\ + \ are collected and copied to the output folder during the Viash build step.\n" + type: "object" + properties: + name: + description: "The full name of the dependency component. This should include\ + \ the namespace." + type: "string" + repository: + oneOf: + - description: "Specifies the repository location where the dependency component\ + \ can be found.\nThis must either be a full definition of the repository\ + \ or the name of a repository referenced as it is defined under repositories.\n\ + Additionally, the full definition can be specified as a single string\ + \ where all parameters such as repository type, url, branch or tag are\ + \ specified.\nOmitting the value sets the dependency as a local dependency,\ + \ ie. 
the dependency is available in the same namespace as the component.\n" + type: "string" + - description: "Specifies the repository location where the dependency component\ + \ can be found.\nThis must either be a full definition of the repository\ + \ or the name of a repository referenced as it is defined under repositories.\n\ + Additionally, the full definition can be specified as a single string\ + \ where all parameters such as repository type, url, branch or tag are\ + \ specified.\nOmitting the value sets the dependency as a local dependency,\ + \ ie. the dependency is available in the same namespace as the component.\n" + $ref: "#/definitions/Repository" + alias: + description: "An alternative name for the dependency component. This can include\ + \ a namespace if so needed." + type: "string" + required: + - "name" + additionalProperties: false + Repository: + oneOf: + - $ref: "#/definitions/LocalRepository" + - $ref: "#/definitions/GitRepository" + - $ref: "#/definitions/GithubRepository" + - $ref: "#/definitions/ViashhubRepository" + LocalRepository: + description: "Defines a locally present and available repository.\nThis can be\ + \ used to define components from the same code base as the current component.\n\ + Alternatively, this can be used to refer to a code repository present on the\ + \ local hard-drive instead of fetchable remotely, for example during development.\n" + type: "object" + properties: + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + type: + description: "Defines a locally present and available repository.\nThis can\ + \ be used to define components from the same code base as the current component.\n\ + Alternatively, this can be used to refer to a code repository present on\ + \ the local hard-drive instead of fetchable remotely, for example during\ + \ development.\n" + const: "local" + required: + - "type" + additionalProperties: false + GitRepository: + description: "A Git repository where remote dependency components can be found." + type: "object" + properties: + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + uri: + description: "The URI of the Git repository." + type: "string" + type: + description: "A Git repository where remote dependency components can be found." + const: "git" + required: + - "uri" + - "type" + additionalProperties: false + GithubRepository: + description: "A GitHub repository where remote dependency components can be found." + type: "object" + properties: + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + repo: + description: "The name of the GitHub repository." + type: "string" + type: + description: "A GitHub repository where remote dependency components can be\ + \ found." 
+ const: "github" + required: + - "repo" + - "type" + additionalProperties: false + ViashhubRepository: + description: "A Viash-Hub repository where remote dependency components can be\ + \ found." + type: "object" + properties: + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + repo: + description: "The name of the Viash-Hub repository." + type: "string" + type: + description: "A Viash-Hub repository where remote dependency components can\ + \ be found." + const: "viashhub" + required: + - "repo" + - "type" + additionalProperties: false + RepositoryWithName: + oneOf: + - $ref: "#/definitions/LocalRepositoryWithName" + - $ref: "#/definitions/GitRepositoryWithName" + - $ref: "#/definitions/GithubRepositoryWithName" + - $ref: "#/definitions/ViashhubRepositoryWithName" + LocalRepositoryWithName: + description: "Defines a locally present and available repository.\nThis can be\ + \ used to define components from the same code base as the current component.\n\ + Alternatively, this can be used to refer to a code repository present on the\ + \ local hard-drive instead of fetchable remotely, for example during development.\n" + type: "object" + properties: + name: + description: "The identifier used to refer to this repository from dependencies." + type: "string" + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + type: + description: "Defines a locally present and available repository.\nThis can\ + \ be used to define components from the same code base as the current component.\n\ + Alternatively, this can be used to refer to a code repository present on\ + \ the local hard-drive instead of fetchable remotely, for example during\ + \ development.\n" + const: "local" + required: + - "name" + - "type" + additionalProperties: false + GitRepositoryWithName: + description: "A Git repository where remote dependency components can be found." + type: "object" + properties: + name: + description: "The identifier used to refer to this repository from dependencies." + type: "string" + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + uri: + description: "The URI of the Git repository." + type: "string" + type: + description: "A Git repository where remote dependency components can be found." + const: "git" + required: + - "name" + - "uri" + - "type" + additionalProperties: false + GithubRepositoryWithName: + description: "A GitHub repository where remote dependency components can be found." + type: "object" + properties: + name: + description: "The identifier used to refer to this repository from dependencies." + type: "string" + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. 
Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + repo: + description: "The name of the GitHub repository." + type: "string" + type: + description: "A GitHub repository where remote dependency components can be\ + \ found." + const: "github" + required: + - "name" + - "repo" + - "type" + additionalProperties: false + ViashhubRepositoryWithName: + description: "A Viash-Hub repository where remote dependency components can be\ + \ found." + type: "object" + properties: + name: + description: "The identifier used to refer to this repository from dependencies." + type: "string" + path: + description: "Defines a subfolder of the repository to use as base to look\ + \ for the dependency components." + type: "string" + tag: + description: "Defines which version of the dependency component to use. Typically\ + \ this can be a specific tag, branch or commit hash." + type: "string" + repo: + description: "The name of the Viash-Hub repository." + type: "string" + type: + description: "A Viash-Hub repository where remote dependency components can\ + \ be found." + const: "viashhub" + required: + - "name" + - "repo" + - "type" + additionalProperties: false + DockerSetupStrategy: + enum: + - "cb" + - "ifneedbepullelsecachedbuild" + - "donothing" + - "gentlepush" + - "alwayspullelsebuild" + - "build" + - "alwayspull" + - "alwaysbuild" + - "ifneedbebuild" + - "pullelsebuild" + - "p" + - "alwayspullelsecachedbuild" + - "pull" + - "maybepush" + - "ifneedbepullelsebuild" + - "cachedbuild" + - "pullelsecachedbuild" + - "push" + - "forcepush" + - "alwayspush" + - "b" + - "pushifnotpresent" + - "alwayscachedbuild" + - "meh" + - "ifneedbepull" + - "ifneedbecachedbuild" + $comment: "TODO add descriptions to different strategies" + description: "The Docker setup strategy to use when building a container." + Direction: + enum: + - "input" + - "output" + description: "Makes this argument an `input` or an `output`, as in does the file/folder\ + \ needs to be read or written. `input` by default." + Status: + enum: + - "enabled" + - "disabled" + - "deprecated" + description: "Allows setting a component to active, deprecated or disabled." + DockerResolveVolume: + enum: + - "manual" + - "automatic" + - "auto" + - "Manual" + - "Automatic" + - "Auto" + $comment: "TODO make fully case insensitive" + description: "Enables or disables automatic volume mapping. Enabled when set to\ + \ `Automatic` or disabled when set to `Manual`. Default: `Automatic`" + DoubleStrings: + enum: + - "+.inf" + - "+inf" + - "+infinity" + - "positiveinfinity" + - "positiveinf" + - "-.inf" + - "-inf" + - "-infinity" + - "negativeinfinity" + - "negativeinf" + - ".nan" + - "nan" + DoubleWithInf: + oneOf: + - type: "number" + - $ref: "#/definitions/DoubleStrings" +oneOf: +- $ref: "#/definitions/Config" + diff --git a/common/schemas/script.sh b/common/schemas/script.sh new file mode 100755 index 0000000..e54e01a --- /dev/null +++ b/common/schemas/script.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +VIASH_VERSION=0.9.0 viash export json_schema -f yaml > schemas/schema_viash.yaml \ No newline at end of file diff --git a/common/schemas/task_config.yaml b/common/schemas/task_config.yaml new file mode 100644 index 0000000..a77153f --- /dev/null +++ b/common/schemas/task_config.yaml @@ -0,0 +1,94 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: "Task configuration" +description: "A Viash package configuration file. It's name should be `_viash.yaml`." 
+type: "object" +properties: + organization: + description: "The organization of the package." + type: "string" + name: + description: "The name of the package." + type: "string" + source: + description: "Which source directory to use for the `viash ns` commands." + type: "string" + description: + description: "A description of the package. This is only used for documentation.\ + \ Multiline descriptions are supported." + type: "string" + config_mods: + oneOf: + - description: "Which config mods to apply." + type: "string" + - type: "array" + items: + description: "Which config mods to apply." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + required: ["image"] + properties: + image: + description: "Path to a thumbnail for this OpenProblems task. Should be an *.svg file renderable to a 450x150 px image." + type: "string" + test_resources: + description: "A list of test resources that are required to run the tests." + type: "array" + items: + type: "object" + required: ["type", "path", "dest"] + additionalProperties: false + properties: + type: + description: "The type of the test resource." + const: "s3" + path: + description: "The path to the test resource." + type: "string" + dest: + description: "The destination path of the test resource." + type: "string" + license: + description: "The license of the package." + type: "string" + references: + description: "References to external resources related to the package." + $ref: "schema_viash.yaml#/definitions/References" + authors: + description: "The authors of the package." + type: "array" + items: + $ref: "schema_viash.yaml#/definitions/Author" + repositories: + description: "Common repository definitions for component dependencies." + type: "array" + items: + $ref: "schema_viash.yaml#/definitions/RepositoryWithName" + keywords: + description: "The keywords of the package." + type: "array" + items: + type: "string" + target: + description: "Which target directory to use for `viash ns build`." + type: "string" + summary: + description: "A one-sentence summary of the package. This is only used for\ + \ documentation." + type: "string" + viash_version: + description: "Which version of Viash to use." + type: "string" + label: + description: "A clean version of the package name. This is only used for documentation." + type: "string" + version: + description: "The version of the package." + type: "string" + links: + description: "External links of the package." + $ref: "schema_viash.yaml#/definitions/Links" +required: [viash_version, name, organization, license, label, keywords, summary, description, authors, links, info] +additionalProperties: false diff --git a/common/schemas/task_control_method.yaml b/common/schemas/task_control_method.yaml new file mode 100644 index 0000000..7baeaa1 --- /dev/null +++ b/common/schemas/task_control_method.yaml @@ -0,0 +1,71 @@ +$schema: "http://json-schema.org/draft-07/schema#" +title: Control Method +description: | + A control method is used to test the relative performance of all other methods, + and also as a quality control for the pipeline as a whole. A control method can + either be a positive control or a negative control. The positive control and + negative control methods set a maximum and minimum threshold for performance, + so any new method should perform better than the negative control methods and + worse than the positive control method. 
+type: object +required: [__merge__, name, label, summary, description, info, resources, runners] +properties: + __merge__: + "$ref": "schema_openproblems_definitions.yaml#/definitions/CompAPIMerge" + name: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Name" + status: + "$ref": "schema_viash.yaml#/definitions/Status" + label: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Label" + summary: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Summary" + description: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Description" + references: + "$ref": "schema_viash.yaml#/definitions/References" + links: + "$ref": "schema_viash.yaml#/definitions/Links" + info: + type: object + description: Metadata of the component. + properties: + preferred_normalization: + "$ref": "schema_openproblems_definitions.yaml#/definitions/PreferredNormalization" + variants: + "$ref": "schema_openproblems_definitions.yaml#/definitions/MethodVariants" + arguments: + type: array + description: Component-specific parameters. + items: + "$ref": "schema_viash.yaml#/definitions/Argument" + argument_groups: + type: array + description: Component-specific parameter groups. + items: + "$ref": "schema_viash.yaml#/definitions/ArgumentGroup" + resources: + type: array + description: Resources required to run the component. + items: + "$ref": "schema_viash.yaml#/definitions/Resource" + test_resources: + type: array + description: One or more scripts and resources used to test the component. + items: + "$ref": "schema_viash.yaml#/definitions/Resource" + engines: + type: array + description: "A list of engine environments to execute target artifacts in.\n\ + \n - NativeEngine\n - DockerEngine\n" + items: + anyOf: + - "$ref": "schema_openproblems_definitions.yaml#/definitions/DockerEngine" + runners: + type: array + description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\ + \ - NextflowRunner\n" + items: + anyOf: + - "$ref": "schema_openproblems_definitions.yaml#/definitions/NextflowRunner" + - "$ref": "schema_openproblems_definitions.yaml#/definitions/ExecutableRunner" diff --git a/common/schemas/task_method.yaml b/common/schemas/task_method.yaml new file mode 100644 index 0000000..a9c8ca7 --- /dev/null +++ b/common/schemas/task_method.yaml @@ -0,0 +1,68 @@ +$schema: "http://json-schema.org/draft-07/schema#" +title: Method +description: | + A method is a specific technique used to solve the task problem and is + compared to the control methods and other methods to determine the best + approach for the task depending on the type of dataset. +type: object +required: [__merge__, name, label, summary, description, info, resources, references, links, runners] +properties: + __merge__: + "$ref": "schema_openproblems_definitions.yaml#/definitions/CompAPIMerge" + name: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Name" + status: + "$ref": "schema_viash.yaml#/definitions/Status" + label: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Label" + summary: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Summary" + description: + "$ref": "schema_openproblems_definitions.yaml#/definitions/Description" + references: + "$ref": "schema_viash.yaml#/definitions/References" + links: + "$ref": "schema_viash.yaml#/definitions/Links" + info: + type: object + description: Metadata of the component. 
+ properties:
+ preferred_normalization:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/PreferredNormalization"
+ variants:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/MethodVariants"
+ arguments:
+ type: array
+ description: Component-specific parameters.
+ items:
+ "$ref": "schema_viash.yaml#/definitions/Argument"
+ argument_groups:
+ type: array
+ description: Component-specific parameter groups.
+ items:
+ "$ref": "schema_viash.yaml#/definitions/ArgumentGroup"
+ resources:
+ type: array
+ description: Resources required to run the component.
+ items:
+ "$ref": "schema_viash.yaml#/definitions/Resource"
+ test_resources:
+ type: array
+ description: One or more scripts and resources used to test the component.
+ items:
+ "$ref": "schema_viash.yaml#/definitions/Resource"
+ engines:
+ type: array
+ description: "A list of engine environments to execute target artifacts in.\n\ \n - NativeEngine\n - DockerEngine\n"
+ items:
+ anyOf:
+ - "$ref": "schema_openproblems_definitions.yaml#/definitions/DockerEngine"
+ runners:
+ type: array
+ description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\ \ - NextflowRunner\n"
+ items:
+ anyOf:
+ - "$ref": "schema_openproblems_definitions.yaml#/definitions/NextflowRunner"
+ - "$ref": "schema_openproblems_definitions.yaml#/definitions/ExecutableRunner"
diff --git a/common/schemas/task_metric.yaml b/common/schemas/task_metric.yaml new file mode 100644 index 0000000..7ff9b0d --- /dev/null +++ b/common/schemas/task_metric.yaml @@ -0,0 +1,96 @@
+$schema: "http://json-schema.org/draft-07/schema#"
+title: Metric
+description: |
+ A metric is a quantitative measure used to evaluate the quality of a method's
+ output, so that methods can be compared to each other and to the control
+ methods defined for the task.
+type: object
+required: [__merge__, name, info, resources, runners]
+properties:
+ __merge__:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/CompAPIMerge"
+ name:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Name"
+ status:
+ "$ref": "schema_viash.yaml#/definitions/Status"
+ label:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Label"
+ summary:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Summary"
+ description:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Description"
+ info:
+ type: object
+ description: Metadata of the component.
+ properties:
+ preferred_normalization:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/PreferredNormalization"
+ metrics:
+ type: array
+ minItems: 1
+ items:
+ type: object
+ description: Metadata of each metric.
+ additionalProperties: false
+ required: [label, summary, description, references, links, min, max, maximize]
+ properties:
+ name:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Name"
+ label:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Label"
+ summary:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Summary"
+ description:
+ "$ref": "schema_openproblems_definitions.yaml#/definitions/Description"
+ references:
+ "$ref": "schema_viash.yaml#/definitions/References"
+ links:
+ "$ref": "schema_viash.yaml#/definitions/Links"
+ min:
+ description: The lowest possible value of the metric.
+ oneOf:
+ - type: number
+ - const: "-.inf"
+ max:
+ description: The highest possible value of the metric. 
+ oneOf: + - type: number + - const: "+.inf" + maximize: + type: boolean + description: Whether a higher metric value is better. + arguments: + type: array + description: Component-specific parameters. + items: + "$ref": "schema_viash.yaml#/definitions/Argument" + argument_groups: + type: array + description: Component-specific parameter groups. + items: + "$ref": "schema_viash.yaml#/definitions/ArgumentGroup" + resources: + type: array + description: Resources required to run the component. + items: + "$ref": "schema_viash.yaml#/definitions/Resource" + test_resources: + type: array + description: One or more scripts and resources used to test the component. + items: + "$ref": "schema_viash.yaml#/definitions/Resource" + engines: + type: array + description: "A list of engine environments to execute target artifacts in.\n\ + \n - NativeEngine\n - DockerEngine\n" + items: + anyOf: + - "$ref": "schema_openproblems_definitions.yaml#/definitions/DockerEngine" + runners: + type: array + description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\ + \ - NextflowRunner\n" + items: + anyOf: + - "$ref": "schema_openproblems_definitions.yaml#/definitions/NextflowRunner" + - "$ref": "schema_openproblems_definitions.yaml#/definitions/ExecutableRunner" diff --git a/common/scripts/create_component b/common/scripts/create_component new file mode 100755 index 0000000..59eff50 --- /dev/null +++ b/common/scripts/create_component @@ -0,0 +1,40 @@ +#!/bin/bash + +repository="openproblems-bio/core" +version="f1793a4dd9f5e0aaa25745ae644f1172cb3fc496" +component="project/create_component" + +cache_dir="$HOME/.cache/openproblems/repos/$repository" +executable_path="viash/core/target/executable/${component}/$(basename ${component})" +version_file="$HOME/.cache/openproblems/versions/${repository}/${component}/version.txt" + +function get_repository { + if [ ! -d "$cache_dir" ]; then + mkdir -p "$cache_dir" + git clone https://github.com/${repository}.git "$cache_dir" > /dev/null + fi +} + +function update_repository { + (cd "$cache_dir" && \ + git fetch > /dev/null 2>&1 && \ + git checkout "$version" > /dev/null 2>&1 && \ + git pull > /dev/null 2>&1) +} + +function pull_image_if_version_changed { + if [ ! -d "$(dirname $version_file)" ]; then + mkdir -p "$(dirname $version_file)" + fi + if [ ! -f "$version_file" ] || [ "$(cat $version_file)" != "$version" ]; then + "${cache_dir}/${executable_path}" ---setup pull + echo "$version" > "$version_file" + fi +} + +get_repository +update_repository +pull_image_if_version_changed + +# run the executable +"${cache_dir}/${executable_path}" $@ diff --git a/common/scripts/create_task_readme b/common/scripts/create_task_readme new file mode 100755 index 0000000..5588594 --- /dev/null +++ b/common/scripts/create_task_readme @@ -0,0 +1,40 @@ +#!/bin/bash + +repository="openproblems-bio/core" +version="26382c1d4934f41b9bd301cffb9911ef5ba75a8e" +component="project/create_task_readme" + +cache_dir="$HOME/.cache/openproblems/repos/$repository" +executable_path="viash/core/target/executable/${component}/$(basename ${component})" +version_file="$HOME/.cache/openproblems/versions/${repository}/${component}/version.txt" + +function get_repository { + if [ ! 
-d "$cache_dir" ]; then + mkdir -p "$cache_dir" + git clone https://github.com/${repository}.git "$cache_dir" > /dev/null + fi +} + +function update_repository { + (cd "$cache_dir" && \ + git fetch > /dev/null 2>&1 && \ + git checkout "$version" > /dev/null 2>&1 && \ + git pull > /dev/null 2>&1) +} + +function pull_image_if_version_changed { + if [ ! -d "$(dirname $version_file)" ]; then + mkdir -p "$(dirname $version_file)" + fi + if [ ! -f "$version_file" ] || [ "$(cat $version_file)" != "$version" ]; then + "${cache_dir}/${executable_path}" ---setup pull + echo "$version" > "$version_file" + fi +} + +get_repository +update_repository +pull_image_if_version_changed + +# run the executable +"${cache_dir}/${executable_path}" $@ diff --git a/common/scripts/fetch_task_run b/common/scripts/fetch_task_run new file mode 100755 index 0000000..37e8652 --- /dev/null +++ b/common/scripts/fetch_task_run @@ -0,0 +1,40 @@ +#!/bin/bash + +repository="openproblems-bio/core" +version="fb6a86d18870f7cc771bf9db764d56aa0969f2c5" +component="project/fetch_task_run" + +cache_dir="$HOME/.cache/openproblems/repos/$repository" +executable_path="viash/core/target/executable/${component}/$(basename ${component})" +version_file="$HOME/.cache/openproblems/versions/${repository}/${component}/version.txt" + +function get_repository { + if [ ! -d "$cache_dir" ]; then + mkdir -p "$cache_dir" + git clone https://github.com/${repository}.git "$cache_dir" > /dev/null + fi +} + +function update_repository { + (cd "$cache_dir" && \ + git fetch > /dev/null 2>&1 && \ + git checkout "$version" > /dev/null 2>&1 && \ + git pull > /dev/null 2>&1) +} + +function pull_image_if_version_changed { + if [ ! -d "$(dirname $version_file)" ]; then + mkdir -p "$(dirname $version_file)" + fi + if [ ! -f "$version_file" ] || [ "$(cat $version_file)" != "$version" ]; then + "${cache_dir}/${executable_path}" ---setup pull + echo "$version" > "$version_file" + fi +} + +get_repository +update_repository +pull_image_if_version_changed + +# run the executable +"${cache_dir}/${executable_path}" $@ diff --git a/common/scripts/sync_resources b/common/scripts/sync_resources new file mode 100755 index 0000000..c1c1bb8 --- /dev/null +++ b/common/scripts/sync_resources @@ -0,0 +1,40 @@ +#!/bin/bash + +repository="openproblems-bio/core" +version="f1793a4dd9f5e0aaa25745ae644f1172cb3fc496" +component="project/sync_resources" + +cache_dir="$HOME/.cache/openproblems/repos/$repository" +executable_path="viash/core/target/executable/${component}/$(basename ${component})" +version_file="$HOME/.cache/openproblems/versions/${repository}/${component}/version.txt" + +function get_repository { + if [ ! -d "$cache_dir" ]; then + mkdir -p "$cache_dir" + git clone https://github.com/${repository}.git "$cache_dir" > /dev/null + fi +} + +function update_repository { + (cd "$cache_dir" && \ + git fetch > /dev/null 2>&1 && \ + git checkout "$version" > /dev/null 2>&1 && \ + git pull > /dev/null 2>&1) +} + +function pull_image_if_version_changed { + if [ ! -d "$(dirname $version_file)" ]; then + mkdir -p "$(dirname $version_file)" + fi + if [ ! 
-f "$version_file" ] || [ "$(cat $version_file)" != "$version" ]; then + "${cache_dir}/${executable_path}" ---setup pull + echo "$version" > "$version_file" + fi +} + +get_repository +update_repository +pull_image_if_version_changed + +# run the executable +"${cache_dir}/${executable_path}" $@ diff --git a/common/scripts/upgrade_config b/common/scripts/upgrade_config new file mode 100755 index 0000000..c0124e9 --- /dev/null +++ b/common/scripts/upgrade_config @@ -0,0 +1,40 @@ +#!/bin/bash + +repository="openproblems-bio/core" +version="f1793a4dd9f5e0aaa25745ae644f1172cb3fc496" +component="project/upgrade_config" + +cache_dir="$HOME/.cache/openproblems/repos/$repository" +executable_path="viash/core/target/executable/${component}/$(basename ${component})" +version_file="$HOME/.cache/openproblems/versions/${repository}/${component}/version.txt" + +function get_repository { + if [ ! -d "$cache_dir" ]; then + mkdir -p "$cache_dir" + git clone https://github.com/${repository}.git "$cache_dir" > /dev/null + fi +} + +function update_repository { + (cd "$cache_dir" && \ + git fetch > /dev/null 2>&1 && \ + git checkout "$version" > /dev/null 2>&1 && \ + git pull > /dev/null 2>&1) +} + +function pull_image_if_version_changed { + if [ ! -d "$(dirname $version_file)" ]; then + mkdir -p "$(dirname $version_file)" + fi + if [ ! -f "$version_file" ] || [ "$(cat $version_file)" != "$version" ]; then + "${cache_dir}/${executable_path}" ---setup pull + echo "$version" > "$version_file" + fi +} + +get_repository +update_repository +pull_image_if_version_changed + +# run the executable +"${cache_dir}/${executable_path}" $@ diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..62f0140 --- /dev/null +++ b/main.nf @@ -0,0 +1,3 @@ +workflow { + print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.") +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..8fc6c4e --- /dev/null +++ b/nextflow.config @@ -0,0 +1 @@ +process.container = 'nextflow/bash:latest' \ No newline at end of file diff --git a/scripts/create_component/.gitignore b/scripts/create_component/.gitignore new file mode 100644 index 0000000..09380f9 --- /dev/null +++ b/scripts/create_component/.gitignore @@ -0,0 +1,2 @@ +# if users change the scripts, the changes should not be committed. 
+/create_*_*.sh \ No newline at end of file diff --git a/scripts/create_readme.sh b/scripts/create_readme.sh new file mode 100755 index 0000000..b43731f --- /dev/null +++ b/scripts/create_readme.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +common/scripts/create_task_readme --input src/api \ No newline at end of file diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh new file mode 100755 index 0000000..ae15999 --- /dev/null +++ b/scripts/create_resources/resources.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources/datasets/**/log_cp10k/state.yaml +rename_keys: 'input:output_dataset' +settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' +output_state: "$id/state.yaml" +publish_dir: s3://openproblems-data/resources/denoising/datasets +HERE + +tw launch https://github.com/openproblems-bio/task_denoising.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ + --labels denoising,process_datasets diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh new file mode 100755 index 0000000..4711d79 --- /dev/null +++ b/scripts/create_resources/test_resources.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/task_denoising + +mkdir -p $DATASET_DIR + +# process dataset +viash run src/data_processors/process_dataset/config.vsh.yaml -- \ + --input $RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad \ + --output_train $DATASET_DIR/cxg_immune_cell_atlas/train.h5ad \ + --output_test $DATASET_DIR/cxg_immune_cell_atlas/test.h5ad + +# run one method +viash run src/methods/magic/config.vsh.yaml -- \ + --input_train $DATASET_DIR/cxg_immune_cell_atlas/train.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/denoised.h5ad + +# run one metric +viash run src/metrics/poisson/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/cxg_immune_cell_atlas/denoised.h5ad \ + --input_test $DATASET_DIR/cxg_immune_cell_atlas/test.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad + +# write manual state.yaml. 
This is not strictly necessary, but it may prove useful.
+cat > $DATASET_DIR/cxg_immune_cell_atlas/state.yaml << HERE
+id: cxg_immune_cell_atlas
+train: !file train.h5ad
+test: !file test.h5ad
+prediction: !file denoised.h5ad
+score: !file score.h5ad
+HERE
+
+# only run this if you have access to the openproblems-data bucket
+aws s3 sync --profile op \
+ "$DATASET_DIR" s3://openproblems-data/resources_test/task_denoising \
+ --delete --dryrun
diff --git a/scripts/project/build_all_components.sh b/scripts/project/build_all_components.sh new file mode 100755 index 0000000..4e90d91 --- /dev/null +++ b/scripts/project/build_all_components.sh @@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (see https://viash.io/reference/cli/ns_build.html)
+viash ns build --parallel
diff --git a/scripts/project/build_all_docker_containers.sh b/scripts/project/build_all_docker_containers.sh new file mode 100755 index 0000000..5d43639 --- /dev/null +++ b/scripts/project/build_all_docker_containers.sh @@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (see https://viash.io/reference/cli/ns_build.html)
+# and set up the containers via a cached build
+viash ns build --parallel --setup cachedbuild
diff --git a/scripts/project/test_all_components.sh b/scripts/project/test_all_components.sh new file mode 100755 index 0000000..8a08afd --- /dev/null +++ b/scripts/project/test_all_components.sh @@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Test all components in a namespace (see https://viash.io/reference/cli/ns_test.html)
+viash ns test --parallel
diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh new file mode 100755 index 0000000..16554a3 --- /dev/null +++ b/scripts/run_benchmark/run_full_local.sh @@ -0,0 +1,40 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# NOTE: depending on the datasets and components, you may need to launch this workflow
+# on a different compute platform (e.g. an HPC, AWS Cloud, Azure Cloud, Google Cloud).
+# please refer to the Nextflow documentation for more details:
+# https://www.nextflow.io/docs/latest/
+
+
+set -e
+
+echo "Running full benchmark"
+echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!" 
+ +# generate a unique id +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="resources/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: resources/datasets/**/state.yaml +rename_keys: 'input_train:output_train;input_test:output_test' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +# run the benchmark +nextflow run openproblems-bio/task_denoising \ + --revision build/main \ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + -params-file /tmp/params.yaml diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh new file mode 100755 index 0000000..5a26e67 --- /dev/null +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +# generate a unique id +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/task_denoising/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/task_denoising/datasets/**/state.yaml +rename_keys: 'input_train:output_train;input_test:output_test' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task_denoising.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_denoising,full \ No newline at end of file diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh new file mode 100755 index 0000000..0d01b0f --- /dev/null +++ b/scripts/run_benchmark/run_test_local.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +echo "Running benchmark on test data" +echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!" + +# generate a unique id +RUN_ID="testrun_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="temp/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: resources_test/task_denoising/**/state.yaml +rename_keys: 'input_train:train;input_test:test' +output_state: "state.yaml" +publish_dir: "$publish_dir" +settings: '{"methods_exclude": ["scprint"]}' +HERE + +nextflow run . 
\ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + -params-file /tmp/params.yaml diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh new file mode 100755 index 0000000..a728475 --- /dev/null +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +resources_test_s3=s3://openproblems-data/resources_test/task_denoising +publish_dir_s3="s3://openproblems-nextflow/temp/results/task_denoising/$(date +%Y-%m-%d_%H-%M-%S)" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +id: cxg_immune_cell_atlas +input_train: $resources_test_s3/cxg_immune_cell_atlas/train.h5ad +input_test: $resources_test_s3/cxg_immune_cell_atlas/test.h5ad +output_state: "state.yaml" +publish_dir: $publish_dir_s3 +HERE + +tw launch https://github.com/openproblems-bio/task_denoising.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_denoising,test diff --git a/scripts/sync_resources.sh b/scripts/sync_resources.sh new file mode 100755 index 0000000..20b87e7 --- /dev/null +++ b/scripts/sync_resources.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +common/scripts/sync_resources diff --git a/src/api/base_method.yaml b/src/api/base_method.yaml new file mode 100644 index 0000000..07d7481 --- /dev/null +++ b/src/api/base_method.yaml @@ -0,0 +1,20 @@ +namespace: "methods" +info: + type: method + type_info: + label: Method + summary: A method. + description: | + A denoising method to remove noise (i.e. technical artifacts) from a dataset. +arguments: + - name: --input_train + __merge__: file_train.yaml + required: true + direction: input + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/check_config.py diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml new file mode 100644 index 0000000..2988eb5 --- /dev/null +++ b/src/api/comp_control_method.yaml @@ -0,0 +1,33 @@ +namespace: control_methods +info: + type: control_method + type_info: + label: Control Method + summary: A control method. + description: | + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. 
+arguments: + - name: --input_train + __merge__: file_train.yaml + required: true + direction: input + - name: --input_test + __merge__: file_test.yaml + required: true + direction: input + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /common/library.bib + - path: /resources_test/task_denoising/cxg_immune_cell_atlas + dest: resources_test/task_denoising/cxg_immune_cell_atlas diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml new file mode 100644 index 0000000..f2dcb66 --- /dev/null +++ b/src/api/comp_data_processor.yaml @@ -0,0 +1,26 @@ +namespace: "data_processors" +info: + type: process_dataset + type_info: + label: Data processor + summary: A denoising dataset processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. +arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_train" + __merge__: file_train.yaml + direction: output + required: true + - name: "--output_test" + __merge__: file_test.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/common/cxg_immune_cell_atlas + dest: resources_test/common/cxg_immune_cell_atlas diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml new file mode 100644 index 0000000..da7c11d --- /dev/null +++ b/src/api/comp_method.yaml @@ -0,0 +1,9 @@ +__merge__: base_method.yaml +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /common/library.bib + - path: /resources_test/task_denoising/cxg_immune_cell_atlas + dest: resources_test/task_denoising/cxg_immune_cell_atlas diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml new file mode 100644 index 0000000..e113937 --- /dev/null +++ b/src/api/comp_metric.yaml @@ -0,0 +1,29 @@ +namespace: "metrics" +info: + type: metric + type_info: + label: Metric + summary: A metric. + description: | + A metric for evaluating denoised datasets. +arguments: + - name: "--input_test" + __merge__: file_test.yaml + direction: input + required: true + - name: "--input_prediction" + __merge__: file_prediction.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/check_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /common/library.bib + - path: /resources_test/task_denoising/cxg_immune_cell_atlas + dest: resources_test/task_denoising/cxg_immune_cell_atlas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml new file mode 100644 index 0000000..57ff616 --- /dev/null +++ b/src/api/file_common_dataset.yaml @@ -0,0 +1,47 @@ +type: file +example: "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" +label: "Common Dataset" +summary: A subset of the common dataset. 
+info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: false + + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml new file mode 100644 index 0000000..f48a4b3 --- /dev/null +++ b/src/api/file_prediction.yaml @@ -0,0 +1,21 @@ +type: file +example: "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" +label: "Denoised data" +summary: A denoised dataset as output by a method. +info: + format: + type: h5ad + layers: + - type: integer + name: denoised + description: denoised data + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml new file mode 100644 index 0000000..bda2ede --- /dev/null +++ b/src/api/file_score.yaml @@ -0,0 +1,26 @@ +type: file +example: resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad +label: Score +summary: "File indicating the score of a metric." +info: + format: + type: h5ad + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/api/file_test.yaml b/src/api/file_test.yaml new file mode 100644 index 0000000..b6db758 --- /dev/null +++ b/src/api/file_test.yaml @@ -0,0 +1,45 @@ +type: file +example: "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" +label: "Test data" +summary: The subset of molecules used for the test dataset +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. 
+ required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: train_sum + type: integer + description: The total number of counts in the training dataset. + required: true diff --git a/src/api/file_train.yaml b/src/api/file_train.yaml new file mode 100644 index 0000000..fc7ba7c --- /dev/null +++ b/src/api/file_train.yaml @@ -0,0 +1,21 @@ +type: file +example: "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" +label: "Training data" +summary: The subset of molecules used for the training dataset +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false diff --git a/src/control_methods/no_denoising/config.vsh.yaml b/src/control_methods/no_denoising/config.vsh.yaml new file mode 100644 index 0000000..5f0272a --- /dev/null +++ b/src/control_methods/no_denoising/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +name: "no_denoising" +label: No Denoising +summary: "negative control by copying train counts" +description: "This method serves as a negative control, where the denoised data is a copy of the unaltered training data. This represents the scoring threshold if denoising was not performed on the data." +info: + v1: + path: openproblems/tasks/denoising/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + no_denoising: + preferred_normalization: counts +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/control_methods/no_denoising/script.py b/src/control_methods/no_denoising/script.py new file mode 100644 index 0000000..5d7dd38 --- /dev/null +++ b/src/control_methods/no_denoising/script.py @@ -0,0 +1,22 @@ +import anndata as ad + +## VIASH START +par = { + 'input_train': 'output_train.h5ad', + 'output': 'output_ND.h5ad', +} +meta = { + 'name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_train.layers['counts'] + +input_train.uns["method_id"] = meta['name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") diff --git a/src/control_methods/perfect_denoising/config.vsh.yaml b/src/control_methods/perfect_denoising/config.vsh.yaml new file mode 100644 index 0000000..47c3c5d --- /dev/null +++ b/src/control_methods/perfect_denoising/config.vsh.yaml @@ -0,0 +1,26 @@ +__merge__: ../../api/comp_control_method.yaml + +name: "perfect_denoising" +label: Perfect Denoising +summary: "Positive control by copying the test counts" +description: "This method serves as a positive control, where the test data is copied 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised as it will be compared to the test data in the metrics." 
+info: + v1: + path: openproblems/tasks/denoising/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + perfect_denoising: + preferred_normalization: counts +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/control_methods/perfect_denoising/script.py b/src/control_methods/perfect_denoising/script.py new file mode 100644 index 0000000..2960ed4 --- /dev/null +++ b/src/control_methods/perfect_denoising/script.py @@ -0,0 +1,24 @@ +import anndata as ad + +## VIASH START +par = { + 'input_train': 'resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad', + 'input_test': 'resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad', + 'output': 'output_PD.h5ad', +} +meta = { + 'name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_test.layers['counts'] + +input_train.uns["method_id"] = meta['name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml new file mode 100644 index 0000000..abfbed6 --- /dev/null +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: ../../api/comp_data_processor.yaml +name: "process_dataset" +description: | + Split data using molecular cross-validation. + + Splits molecules into two (potentially overlapping) groups using a fraction ratio. + These are output as two separate AnnData objects. +arguments: + - name: "--method" + type: "string" + description: "The process method to assign train/test." + choices: ["mcv"] + default: "mcv" + - name: "--train_frac" + type: "double" + description: "The fraction the molecules need to be split to train dataset" + default: 0.9 + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: "--n_obs_limit" + type: "integer" + description: "The maximum number of cells the dataset may have before subsampling according to `obs.batch`." 
+ default: 10000 +resources: + - type: python_script + path: script.py + - path: helper.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - numpy + - scipy +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu , midtime] diff --git a/src/data_processors/process_dataset/helper.py b/src/data_processors/process_dataset/helper.py new file mode 100644 index 0000000..2044ed4 --- /dev/null +++ b/src/data_processors/process_dataset/helper.py @@ -0,0 +1,55 @@ +# MIT License + +# Copyright (c) 2019 Chan Zuckerberg Biohub + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Copied from https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/util.py + + +from typing import Tuple + +import numpy as np + +def split_molecules( + umis: np.ndarray, + data_split: float, + overlap_factor: float = 0.0, + random_state: np.random.RandomState = None, +) -> Tuple[np.ndarray, np.ndarray]: + """Splits molecules into two (potentially overlapping) groups. 
+ :param umis: Array of molecules to split + :param data_split: Proportion of molecules to assign to the first group + :param overlap_factor: Overlap correction factor, if desired + :param random_state: For reproducible sampling + :return: umis_X and umis_Y, representing ``split`` and ``~(1 - split)`` counts + sampled from the input array + """ + if random_state is None: + random_state = np.random.RandomState() + + umis_X_disjoint = random_state.binomial(umis, data_split - overlap_factor) + umis_Y_disjoint = random_state.binomial( + umis - umis_X_disjoint, (1 - data_split) / (1 - data_split + overlap_factor) + ) + overlap_factor = umis - umis_X_disjoint - umis_Y_disjoint + umis_X = umis_X_disjoint + overlap_factor + umis_Y = umis_Y_disjoint + overlap_factor + + return umis_X, umis_Y \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py new file mode 100644 index 0000000..8f3f5ac --- /dev/null +++ b/src/data_processors/process_dataset/script.py @@ -0,0 +1,100 @@ +import sys +import random +import anndata as ad +import numpy as np + +## VIASH START +par = { + 'input': "resources/datasets/openproblems_v1/pancreas/log_cp10k/dataset.h5ad", + 'output_train': "output/processed_datasets/train.h5ad", + 'output_test': "output/processed_datasets/test.h5ad", + 'train_frac': 0.9, + 'seed': 0, + 'n_obs_limit': 4000 +} +meta = { + "name": "process_dataset", + "resources_dir": "src/data_processors/process_dataset" +} +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from helper import split_molecules + +# set random state +random_state = np.random.RandomState(par['seed']) + +print(">> Load Data", flush=True) +adata = ad.read_h5ad(par["input"]) + +# limit to max number of observations +adata_output = adata.copy() + +if "batch" in adata.obs: + print(f">> Subsampling observations by largest batch", flush=True) + batch_counts = adata.obs.groupby('batch').size() + sorted_batches = batch_counts.sort_values(ascending=False) + selected_batch = sorted_batches.index[0] + adata_output = adata[adata.obs["batch"]==selected_batch,:].copy() + +if adata_output.n_obs > par["n_obs_limit"]: + print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True) + print(f">> Setting seed to {par['seed']}", flush=True) + random.seed(par["seed"]) + obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs) + obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False) + adata_output = adata_output[obs_index].copy() + +# remove all layers except for counts +print(">> Remove all layers except for counts", flush=True) +for key in list(adata_output.layers.keys()): + if key != "counts": + del adata_output.layers[key] + +# round counts and convert to int +print(">> Round counts and convert to int", flush=True) +counts = np.array(adata_output.layers["counts"]).round().astype(int) + +print(">> process and split data", flush=True) +train_data, test_data = split_molecules( + counts.data, par["train_frac"], 0.0, random_state +) + +X_train = counts.copy() +X_test = counts.copy() +X_train.data = train_data +X_test.data = test_data +X_train.eliminate_zeros() +X_test.eliminate_zeros() + +# copy adata to train_set, test_set +print(">> Create AnnData output objects", flush=True) +train_uns_keys = ["dataset_id", "dataset_organism"] +output_train = ad.AnnData( + layers={"counts": X_train}, + obs=adata_output.obs[[]], + var=adata_output.var[[]], + uns={key: adata_output.uns[key] for key in 
train_uns_keys}
+)
+test_uns_keys = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]
+output_test = ad.AnnData(
+    layers={"counts": X_test},
+    obs=adata_output.obs[[]],
+    var=adata_output.var[[]],
+    uns={key: adata_output.uns[key] for key in test_uns_keys}
+)
+
+# add the total training counts to the test output (train_sum is required by the test file spec)
+output_test.uns["train_sum"] = X_train.sum()
+
+# Remove genes that have no counts in the training data
+print(">> Remove genes with zero counts in the training data", flush=True)
+is_missing = np.array(X_train.sum(axis=0) == 0)
+
+output_train = output_train[:, ~is_missing.flatten()]
+output_test = output_test[:, ~is_missing.flatten()]
+
+print(">> Write to file", flush=True)
+output_train.write_h5ad(par["output_train"])
+output_test.write_h5ad(par["output_test"])
diff --git a/src/methods/alra/config.vsh.yaml b/src/methods/alra/config.vsh.yaml
new file mode 100644
index 0000000..7598429
--- /dev/null
+++ b/src/methods/alra/config.vsh.yaml
@@ -0,0 +1,47 @@
+__merge__: ../../api/comp_method.yaml
+
+name: "alra"
+label: ALRA
+summary: "ALRA imputes missing values in scRNA-seq data by computing rank-k approximation, thresholding by gene, and rescaling the matrix."
+description: |
+  Adaptively-thresholded Low Rank Approximation (ALRA).
+
+  ALRA is a method for imputation of missing values in single cell RNA-sequencing data,
+  described in the preprint, "Zero-preserving imputation of scRNA-seq data using low-rank approximation"
+  available [here](https://www.biorxiv.org/content/early/2018/08/22/397588). Given a
+  scRNA-seq expression matrix, ALRA first computes its rank-k approximation using randomized SVD.
+  Next, each row (gene) is thresholded by the magnitude of the most negative value of that gene.
+  Finally, the matrix is rescaled.
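To make the thresholding step concrete, a rough numpy sketch is shown below. This is not the KlugerLab/ALRA implementation used by this component; the matrix orientation (cells by genes), the use of full SVD instead of randomized SVD, and the omission of the final rescaling are simplifying assumptions.

```python
import numpy as np

def alra_like_threshold(A_norm: np.ndarray, k: int = 20) -> np.ndarray:
    """Rough sketch of ALRA's rank-k approximation plus per-gene thresholding.

    A_norm is a cells-by-genes matrix of normalized expression; the real method
    uses randomized SVD and an additional rescaling step not reproduced here.
    """
    # rank-k approximation via SVD (ALRA uses randomized SVD for speed)
    U, s, Vt = np.linalg.svd(A_norm, full_matrices=False)
    A_k = (U[:, :k] * s[:k]) @ Vt[:k, :]

    # threshold each gene by the magnitude of its most negative imputed value
    thresholds = np.abs(A_k.min(axis=0))
    A_k[A_k < thresholds[None, :]] = 0.0
    return A_k
```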
+references: + doi: 10.1101/397588 +links: + documentation: https://github.com/KlugerLab/ALRA/blob/master/README.md + repository: https://github.com/KlugerLab/ALRA +info: + v1: + path: openproblems/tasks/denoising/methods/alra.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + alra: + preferred_normalization: counts +arguments: + - name: "--norm" + type: string + choices: ["sqrt", "log"] + default: "log" + description: Normalization method +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ Matrix, rsvd ] + github: KlugerLab/ALRA +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/methods/alra/script.R b/src/methods/alra/script.R new file mode 100644 index 0000000..49bba72 --- /dev/null +++ b/src/methods/alra/script.R @@ -0,0 +1,53 @@ +cat(">> Loading dependencies\n") +library(anndata, warn.conflicts = FALSE) +library(ALRA, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input_train = "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad", + norm = "log", + output = "output.h5ad" +) +meta <- list( + name = "alra" +) +## VIASH END + +cat(">> Load input data\n") +input_train <- read_h5ad(par$input_train) + +cat(">> Set normalization method\n") +if (par$norm == "sqrt") { + norm_fn <- sqrt + denorm_fn <- function(x) x^2 +} else if (par$norm == "log") { + norm_fn <- log1p + denorm_fn <- expm1 +} else { + stop("Unknown normalization method: ", par$norm) +} + +cat(">> Normalize data\n") +data <- as.matrix(input_train$layers[["counts"]]) +totalPerCell <- rowSums(data) +data <- sweep(data, 1, totalPerCell, "/") +data <- norm_fn(data) + +cat(">> Run ALRA\n") +data <- alra(data)$A_norm_rank_k_cor_sc +data <- denorm_fn(data) +data <- sweep(data, 1, totalPerCell, "*") + +cat(">> Store output\n") +output <- AnnData( + layers = list(denoised = data), + obs = input_train$obs[, c(), drop = FALSE], + var = input_train$var[, c(), drop = FALSE], + uns = list( + dataset_id = input_train$uns[["dataset_id"]], + method_id = meta$name + ) +) + +cat(">> Write output to file\n") +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/methods/dca/config.vsh.yaml b/src/methods/dca/config.vsh.yaml new file mode 100644 index 0000000..343a032 --- /dev/null +++ b/src/methods/dca/config.vsh.yaml @@ -0,0 +1,52 @@ +__merge__: ../../api/comp_method.yaml +name: "dca" +label: DCA +summary: "A deep autoencoder with ZINB loss function to address the dropout effect in count data" +description: | + "Deep Count Autoencoder + + Removes the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account + using a deep autoencoder with zero-inflated negative binomial (ZINB) loss function." 
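The ZINB loss mentioned above can be written out directly. The following is an illustrative numpy sketch of the per-entry negative log-likelihood, not code from the dca package; `mu`, `theta` and `pi` stand for the autoencoder's predicted mean, dispersion and dropout probability.

```python
import numpy as np
from scipy.special import gammaln

def zinb_nll(y, mu, theta, pi, eps=1e-8):
    """Zero-inflated negative binomial NLL per entry (sketch, not dca's own code)."""
    # log NB(y; mu, theta)
    log_nb = (
        gammaln(y + theta) - gammaln(theta) - gammaln(y + 1)
        + theta * np.log(theta / (theta + mu) + eps)
        + y * np.log(mu / (theta + mu) + eps)
    )
    # zero counts can arise from dropout (with probability pi) or from the NB itself
    nll_zero = -np.log(pi + (1 - pi) * np.exp(log_nb) + eps)
    nll_nonzero = -np.log(1 - pi + eps) - log_nb
    return np.where(y < 1e-8, nll_zero, nll_nonzero)
```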
+references: + doi: 10.1038/s41467-018-07931-2 +links: + documentation: "https://github.com/theislab/dca#readme" + repository: "https://github.com/theislab/dca" +info: + v1: + path: openproblems/tasks/denoising/methods/dca.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + dca: + preferred_normalization: counts +arguments: + - name: "--epochs" + type: "integer" + default: 300 + description: "Number of total epochs in training" +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.9 + setup: + - type: apt + packages: procps + - type: python + github: + - openproblems-bio/core#subdirectory=packages/python/openproblems + - type: python + packages: + - anndata~=0.8.0 + - scanpy + - pyyaml + - requests + - jsonschema + - "git+https://github.com/scottgigante-immunai/dca.git@patch-1" + - numpy<2 +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/methods/dca/script.py b/src/methods/dca/script.py new file mode 100644 index 0000000..32c2c84 --- /dev/null +++ b/src/methods/dca/script.py @@ -0,0 +1,39 @@ +import anndata as ad +from dca.api import dca + +## VIASH START +par = { + 'input_train': 'resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad', + 'output': 'output_dca.h5ad', + 'epochs': 300, +} +meta = { + 'name': 'dca', +} +## VIASH END + +print("load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) + +print("Remove unneeded data", flush=True) +output = ad.AnnData( + X=input_train.layers["counts"], + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Run DCA", flush=True) +dca(output, epochs=par["epochs"]) + +print("Move output to correct location", flush=True) +output.layers["denoised"] = output.X +del output.X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/methods/knn_smoothing/config.vsh.yaml b/src/methods/knn_smoothing/config.vsh.yaml new file mode 100644 index 0000000..d2a4e82 --- /dev/null +++ b/src/methods/knn_smoothing/config.vsh.yaml @@ -0,0 +1,47 @@ +__merge__: ../../api/comp_method.yaml + +name: "knn_smoothing" +label: KNN Smoothing +summary: "Iterative kNN-smoothing denoises scRNA-seq data by iteratively increasing the size of neighbourhoods for smoothing until a maximum k value is reached." +description: "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq + expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first + applies initial normalisation and smoothing. Then, a chosen number of + principal components is used to calculate Euclidean distances between cells. + Minimally sized neighbourhoods are initially determined from these Euclidean + distances, and expression profiles are shared between neighbouring cells. + Then, the resultant smoothed matrix is used as input to the next step of + smoothing, where the size (k) of the considered neighbourhoods is increased, + leading to greater smoothing. This process continues until a chosen maximum k + value has been reached, at which point the iteratively smoothed object is + then optionally scaled to yield a final result." 
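For intuition, a single smoothing step could be sketched as follows. This is an illustrative approximation, not the yanailab/knn-smoothing implementation: the real algorithm iterates with a growing k and uses median normalization with a Freeman-Tukey-style transform rather than the plain square root assumed here.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

def knn_smooth_step(counts: np.ndarray, k: int, n_pcs: int = 10) -> np.ndarray:
    """One kNN-smoothing step on a cells-by-genes count matrix (illustrative sketch)."""
    # normalize library sizes and stabilize variance before computing distances
    norm = counts / counts.sum(axis=1, keepdims=True) * np.median(counts.sum(axis=1))
    pcs = PCA(n_components=n_pcs).fit_transform(np.sqrt(norm))

    # find each cell's k nearest neighbours (including itself) in PC space
    nn = NearestNeighbors(n_neighbors=k).fit(pcs)
    _, idx = nn.kneighbors(pcs)

    # aggregate the raw counts over each cell's neighbourhood
    return counts[idx].sum(axis=1)
```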
+references: + doi: 10.1101/217737 +links: + documentation: https://github.com/yanailab/knn-smoothing#readme + repository: https://github.com/yanailab/knn-smoothing +info: + v1: + path: openproblems/tasks/denoising/methods/knn_smoothing.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + knn_smoothing: + preferred_normalization: counts + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - scipy + github: + - scottgigante-immunai/knn-smoothing@python_package +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/methods/knn_smoothing/script.py b/src/methods/knn_smoothing/script.py new file mode 100644 index 0000000..a0b0fa3 --- /dev/null +++ b/src/methods/knn_smoothing/script.py @@ -0,0 +1,39 @@ +import knn_smooth +import anndata as ad + +## VIASH START +par = { + 'input_train': 'resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad', + 'output': 'output_knn.h5ad', +} +meta = { + 'name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par["input_train"]) + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"].astype(float).transpose().toarray() + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Run KNN smoothing", flush=True) +X = knn_smooth.knn_smoothing(X, k=10).transpose() + +print("Process data", flush=True) +output.layers["denoised"] = X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/methods/magic/config.vsh.yaml b/src/methods/magic/config.vsh.yaml new file mode 100644 index 0000000..1bb3a94 --- /dev/null +++ b/src/methods/magic/config.vsh.yaml @@ -0,0 +1,66 @@ +__merge__: ../../api/comp_method.yaml +name: "magic" +label: MAGIC +summary: "MAGIC imputes and denoises scRNA-seq data that is noisy or dropout-prone." +description: "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for + imputation and denoising of noisy or dropout-prone single cell RNA-sequencing + data. Given a normalised scRNA-seq expression matrix, it first calculates + Euclidean distances between each pair of cells in the dataset, which is then + augmented using a Gaussian kernel (function) and row-normalised to give a + normalised affinity matrix. A t-step markov process is then calculated, by + powering this affinity matrix t times. Finally, the powered affinity matrix + is right-multiplied by the normalised data, causing the final imputed values + to take the value of a per-gene average weighted by the affinities of cells. + The resultant imputed matrix is then rescaled, to more closely match the + magnitude of measurements in the normalised (input) matrix." 
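A toy version of the diffusion operator described above is sketched below. It uses a fixed-bandwidth Gaussian kernel and skips the PCA and adaptive-decay steps, so it is not the KrishnaswamyLab/MAGIC implementation, only an illustration of the Markov-diffusion idea.

```python
import numpy as np
from scipy.spatial.distance import cdist

def magic_like_diffusion(X_norm: np.ndarray, t: int = 3, sigma: float = 1.0) -> np.ndarray:
    """Toy version of MAGIC's diffusion step on a cells-by-genes matrix."""
    # Gaussian affinities from pairwise Euclidean distances between cells
    D = cdist(X_norm, X_norm)
    A = np.exp(-(D ** 2) / (2 * sigma ** 2))

    # row-normalize to obtain a Markov transition matrix, then power it t times
    M = A / A.sum(axis=1, keepdims=True)
    M_t = np.linalg.matrix_power(M, t)

    # imputed values are affinity-weighted averages of the input data
    return M_t @ X_norm
```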
+references: + doi: 10.1016/j.cell.2018.05.061 +links: + documentation: https://github.com/KrishnaswamyLab/MAGIC#readme + repository: https://github.com/KrishnaswamyLab/MAGIC +info: + v1: + path: openproblems/tasks/denoising/methods/magic.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + magic: + magic_approx: + solver: approximate + magic_knn_naive: + norm: log + decay: none + t: 1 + preferred_normalization: counts +arguments: + - name: "--solver" + type: "string" + choices: ["exact", "approximate"] + default: "exact" + description: Which solver to use. + - name: "--norm" + type: string + choices: ["sqrt", "log"] + default: "log" + description: Normalization method + - name: "--decay" + type: integer + default: 1 + description: sets decay rate of kernel tails + - name: "--t" + type: integer + default: 3 + description: power to which the diffusion operator is powered +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pip: [scprep, magic-impute, scipy, scikit-learn<1.2, numpy<2] +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/methods/magic/script.py b/src/methods/magic/script.py new file mode 100644 index 0000000..67a25c6 --- /dev/null +++ b/src/methods/magic/script.py @@ -0,0 +1,75 @@ +import anndata as ad +import numpy as np +import scprep +from magic import MAGIC +import scipy + + +## VIASH START +par = { + "input_train": "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad", + "output": "output_magic.h5ad", + "solver": "exact", + "norm": "sqrt", + "decay": 1, + "t": 3, +} +meta = { + "name": "foo", +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par["input_train"]) + +print("Set normalization method", flush=True) +if par["norm"] == "sqrt": + norm_fn = np.sqrt + denorm_fn = np.square +elif par["norm"] == "log": + norm_fn = np.log1p + denorm_fn = np.expm1 +else: + raise ValueError("Unknown normalization method: " + par["norm"] + ".") + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"] + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Normalize data", flush=True) +X, libsize = scprep.normalize.library_size_normalize( + X, + rescale=1, + return_library_size=True +) +X = scprep.utils.matrix_transform(X, norm_fn) + +print("Run MAGIC", flush=True) +magic = MAGIC( + solver=par["solver"], + decay=par["decay"], + t=par["t"], + verbose=False, +) +X = magic.fit_transform(X, genes="all_genes") + +print("Denormalizing data", flush=True) +X = scprep.utils.matrix_transform(X, denorm_fn) +X = scprep.utils.matrix_vector_elementwise_multiply(X, libsize, axis=0) + +print("Create output AnnData", flush=True) +output.layers["denoised"] = X + +print("Write Data", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/methods/saver/config.vsh.yaml b/src/methods/saver/config.vsh.yaml new file mode 100644 index 0000000..3d07668 --- /dev/null +++ b/src/methods/saver/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_method.yaml + +name: saver +status: disabled +label: SAVER +summary: SAVER (Single-cell Analysis Via Expression Recovery) implements a regularized regression prediction and empirical Bayes method to recover the true gene expression 
profile. +description: | + SAVER takes advantage of gene-to-gene relationships to recover the true expression level of each gene in each cell, + removing technical variation while retaining biological variation across cells (https://github.com/mohuangx/SAVER). + SAVER uses a post-quality-control scRNA-seq dataset with UMI counts as input. SAVER assumes that the count of each + gene in each cell follows a Poisson-gamma mixture, also known as a negative binomial model. Instead of specifying + the gamma prior, we estimate the prior parameters in an empirical Bayes-like approach with a Poisson LASSO regression, + using the expression of other genes as predictors. Once the prior parameters are estimated, SAVER outputs the + posterior distribution of the true expression, which quantifies estimation uncertainty, and the posterior mean is + used as the SAVER recovered expression value. +references: + doi: 10.1038/s41592-018-0033-z +links: + documentation: https://mohuangx.github.io/SAVER/index.html + repository: https://github.com/mohuangx/SAVER +info: + preferred_normalization: counts +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + github: mohuangx/SAVER +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/methods/saver/script.R b/src/methods/saver/script.R new file mode 100644 index 0000000..ab1d658 --- /dev/null +++ b/src/methods/saver/script.R @@ -0,0 +1,39 @@ +cat(">> Loading dependencies\n") +library(anndata, warn.conflicts = FALSE) +library(SAVER, warn.conflicts = FALSE) +library(Matrix, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input_train = "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad", + norm = "log", + output = "output.h5ad" +) +meta <- list( + name = "saver", + ncpus = 30 +) +## VIASH END + +cat(">> Load input data\n") +input_train <- read_h5ad(par$input_train, backed = "r") + +cat(">> Normalize data\n") +data <- as(t(input_train$layers[["counts"]]), "CsparseMatrix") + +cat(">> Run SAVER\n") +data <- t(saver(data, ncores = meta$ncpus, estimates.only = TRUE)) + +cat(">> Store output\n") +output <- AnnData( + layers = list(denoised = data), + obs = input_train$obs[, c(), drop = FALSE], + var = input_train$var[, c(), drop = FALSE], + uns = list( + dataset_id = input_train$uns[["dataset_id"]], + method_id = meta$name + ) +) + +cat(">> Write output to file\n") +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/methods/scprint/config.vsh.yaml b/src/methods/scprint/config.vsh.yaml new file mode 100644 index 0000000..2886ecb --- /dev/null +++ b/src/methods/scprint/config.vsh.yaml @@ -0,0 +1,77 @@ +__merge__: /src/api/base_method.yaml + +name: scprint +label: scPRINT +summary: scPRINT is a large transformer model built for the inference of gene networks +description: | + scPRINT is a large transformer model built for the inference of gene networks + (connections between genes explaining the cell's expression profile) from + scRNAseq data. + + It uses novel encoding and decoding of the cell expression profile and new + pre-training methodologies to learn a cell model. 
+ + scPRINT can be used to perform the following analyses: + + - expression denoising: increase the resolution of your scRNAseq data + - cell embedding: generate a low-dimensional representation of your dataset + - label prediction: predict the cell type, disease, sequencer, sex, and + ethnicity of your cells + - gene network inference: generate a gene network from any cell or cell + cluster in your scRNAseq dataset + +references: + doi: + - 10.1101/2024.07.29.605556 + +links: + documentation: https://cantinilab.github.io/scPRINT/ + repository: https://github.com/cantinilab/scPRINT + +info: + preferred_normalization: counts + variants: + scprint_large: + model_name: "large" + scprint_medium: + model_name: "medium" + scprint_small: + model_name: "small" + +arguments: + - name: "--model_name" + type: "string" + description: Which model to use. Not used if --model is provided. + choices: ["large", "medium", "small"] + default: "large" + - name: --model + type: file + description: Path to the scPRINT model. + required: false + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + pip: + - huggingface_hub + - scprint + - type: docker + run: lamin init --storage ./main --name main --schema bionty + - type: python + script: import bionty as bt; bt.core.sync_all_sources_to_latest() + - type: docker + run: lamin load anonymous/main + - type: python + script: from scdataloader.utils import populate_my_ontology; populate_my_ontology() + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu, gpu] diff --git a/src/methods/scprint/script.py b/src/methods/scprint/script.py new file mode 100644 index 0000000..e5f4c4a --- /dev/null +++ b/src/methods/scprint/script.py @@ -0,0 +1,115 @@ +import os + +import anndata as ad +import scprint +import torch +from huggingface_hub import hf_hub_download +from scdataloader import Preprocessor +from scprint import scPrint +from scprint.tasks import Denoiser +import numpy as np + +## VIASH START +par = { + "input_train": "resources_test/task_batch_integration/cxg_immune_cell_atlas/train.h5ad", + "output": "output.h5ad", + "model_name": "large", + "model": None, +} +meta = {"name": "scprint"} +## VIASH END + +print(f"====== scPRINT version {scprint.__version__} ======", flush=True) + +print("\n>>> Reading input data...", flush=True) +input = ad.read_h5ad(par["input_train"]) +print(input) + +print("\n>>> Preprocessing data...", flush=True) +adata = ad.AnnData( + X=input.layers["counts"] +) +adata.obs_names = input.obs_names +adata.var_names = input.var_names +if input.uns["dataset_organism"] == "homo_sapiens": + adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606" +elif input.uns["dataset_organism"] == "mus_musculus": + adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090" +else: + raise ValueError( + f"scPRINT requires human or mouse data, not '{input.uns['dataset_organism']}'" + ) + +preprocessor = Preprocessor( + # Lower this threshold for test datasets + min_valid_genes_id=1000 if input.n_vars < 2000 else 10000, + # Turn off cell filtering to return results for all cells + filter_cell_by_counts=False, + min_nnz_genes=False, + do_postp=False, + # Skip ontology checks + skip_validate=True, +) +adata = preprocessor(adata) +print(adata) + +model_checkpoint_file = par["model"] +if model_checkpoint_file is None: + print(f"\n>>> Downloading '{par['model_name']}' model...", flush=True) + model_checkpoint_file = 
hf_hub_download( + repo_id="jkobject/scPRINT", filename=f"{par['model_name']}.ckpt" + ) +print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True) +model = scPrint.load_from_checkpoint( + model_checkpoint_file, + transformer="normal", # Don't use this for GPUs with flashattention + precpt_gene_emb=None, +) + +print("\n>>> Denoising data...", flush=True) +if torch.cuda.is_available(): + print("CUDA is available, using GPU", flush=True) + precision = "16-mixed" + dtype = torch.float16 +else: + print("CUDA is not available, using CPU", flush=True) + precision = "32" + dtype = torch.float32 +n_cores_available = len(os.sched_getaffinity(0)) +print(f"Using {n_cores_available} worker cores") +denoiser = Denoiser( + num_workers=n_cores_available, + precision=precision, + max_cells=adata.n_obs, + doplot=False, + dtype=dtype, +) +_, idxs, genes, expr_pred = denoiser(model, adata) +print(f"Predicted expression dimensions: {expr_pred.shape}") + +print("\n>>> Applying denoising...", flush=True) +adata.X = adata.X.tolil() +idxs = idxs if idxs is not None else range(adata.shape[0]) +for i, idx in enumerate(idxs): + adata.X[idx, adata.var.index.get_indexer(genes)] = expr_pred[i] +adata.X = adata.X.tocsr() +print(adata) + +print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + layers={ + "denoised": adata.X[:, adata.var.index.get_indexer(input.var_names)], + }, + obs=input.obs[[]], + var=input.var[[]], + uns={ + "dataset_id": input.uns["dataset_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output AnnData to file...", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Done!", flush=True) diff --git a/src/metrics/mse/config.vsh.yaml b/src/metrics/mse/config.vsh.yaml new file mode 100644 index 0000000..94e800a --- /dev/null +++ b/src/metrics/mse/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_metric.yaml +name: "mse" +info: + metrics: + - name: mse + label: Mean-squared error + summary: "The mean squared error between the denoised counts and the true counts." 
+ description: "The mean squared error between the denoised counts of the training dataset and the true counts of the test dataset after reweighing by the train/test ratio" + references: + doi: 10.1101/786269 + v1: + path: openproblems/tasks/denoising/metrics/mse.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + maximize: false + min: 0 + max: "+.inf" +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scikit-learn + - scprep + - numpy<2 +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/metrics/mse/script.py b/src/metrics/mse/script.py new file mode 100644 index 0000000..8d70589 --- /dev/null +++ b/src/metrics/mse/script.py @@ -0,0 +1,50 @@ +import anndata as ad +import scanpy as sc +import sklearn.metrics +import scprep + +## VIASH START +par = { + 'input_test': 'resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad', + 'input_prediction': 'resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad', + 'output': 'output_mse.h5ad' +} +meta = { + 'name': 'mse' +} +## VIASH END + +print("Load data", flush=True) +input_denoised = ad.read_h5ad(par['input_prediction']) +input_test = ad.read_h5ad(par['input_test']) + +test_data = ad.AnnData(X=input_test.layers["counts"]) +denoised_data = ad.AnnData(X=input_denoised.layers["denoised"]) + +print("Normalize data", flush=True) + +# scaling and transformation +target_sum = 10000 + +sc.pp.normalize_total(test_data, target_sum=target_sum) +sc.pp.log1p(test_data) + +sc.pp.normalize_total(denoised_data, target_sum=target_sum) +sc.pp.log1p(denoised_data) + +print("Compute mse value", flush=True) +error = sklearn.metrics.mean_squared_error( + scprep.utils.toarray(test_data.X), scprep.utils.toarray(denoised_data.X) +) + +print("Store mse value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] = input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") diff --git a/src/metrics/poisson/config.vsh.yaml b/src/metrics/poisson/config.vsh.yaml new file mode 100644 index 0000000..47742a7 --- /dev/null +++ b/src/metrics/poisson/config.vsh.yaml @@ -0,0 +1,32 @@ +__merge__: ../../api/comp_metric.yaml +name: "poisson" +info: + metrics: + - name: poisson + label: Poisson Loss + summary: "The Poisson log likelihood of the true counts observed in the distribution of denoised counts" + description: "The Poisson log likelihood of observing the true counts of the test dataset given the distribution given in the denoised dataset." 
+ references: + doi: 10.1101/786269 + v1: + path: openproblems/tasks/denoising/metrics/poisson.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + maximize: false + min: 0 + max: "+.inf" +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scprep + - numpy<2 +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, midcpu] \ No newline at end of file diff --git a/src/metrics/poisson/script.py b/src/metrics/poisson/script.py new file mode 100644 index 0000000..43caef3 --- /dev/null +++ b/src/metrics/poisson/script.py @@ -0,0 +1,46 @@ +import anndata as ad +import scprep +import numpy as np + +## VIASH START +par = { + 'input_test': 'resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad', + 'input_prediction': 'resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad', + 'output': 'output_poisson.h5ad' +} +meta = { + 'name': 'poisson' +} +## VIASH END + +print("Load Data", flush=True) +input_denoised = ad.read_h5ad(par['input_prediction']) +input_test = ad.read_h5ad(par['input_test']) + +test_data = scprep.utils.toarray(input_test.layers["counts"]) +denoised_data = scprep.utils.toarray(input_denoised.layers["denoised"]) + +print("Compute metric value", flush=True) +# scaling +initial_sum = input_test.uns["train_sum"] +target_sum = test_data.sum() +denoised_data = denoised_data * target_sum / initial_sum + +# from molecular_cross_validation.mcv_sweep import poisson_nll_loss +# copied from: https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/mcv_sweep.py +def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float: + return (y_pred - y_true * np.log(y_pred + 1e-6)).mean() + +error = poisson_nll_loss(test_data, denoised_data) + +print("Store poisson value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] = input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000..7d98b4b --- /dev/null +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,30 @@ +name: "process_datasets" +namespace: "workflows" +argument_groups: + - name: Inputs + arguments: + - name: "--input" + required: true + example: dataset.h5ad + __merge__: "/src/api/file_common_dataset.yaml" + - name: Outputs + arguments: + - name: "--output_train" + __merge__: "/src/api/file_train.yaml" + direction: output + required: true + - name: "--output_test" + __merge__: "/src/api/file_test.yaml" + direction: output + required: true +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: schema/verify_data_structure + repository: core + - name: data_processors/process_dataset +runners: + - type: nextflow diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf new file mode 100644 index 0000000..8b21d78 --- /dev/null +++ b/src/workflows/process_datasets/main.nf @@ -0,0 +1,54 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: 
"state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | verify_data_structure.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test"]) + + emit: + output_ch +} diff --git a/src/workflows/process_datasets/run_test.sh b/src/workflows/process_datasets/run_test.sh new file mode 100755 index 0000000..fe63acd --- /dev/null +++ b/src/workflows/process_datasets/run_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Run this prior to executing this script: +# bin/viash_build -q 'batch_integration' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +nextflow run . \ + -main-script target/nextflow/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --id run_test \ + --input_states "resources_test/common/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \ + --publish_dir "resources_test/task_denoising" \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000..6850e1d --- /dev/null +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,82 @@ +name: "run_benchmark" +namespace: "workflows" +argument_groups: + - name: Inputs + arguments: + - name: "--input_train" + __merge__: "/src/api/file_train.yaml" + required: true + direction: input + - name: "--input_test" + __merge__: "/src/api/file_test.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Method filtering + description: | + Use these arguments to filter methods by name. By default, all methods are + run. If `--methods_include` is defined, only those methods are run. If + `--methods_exclude` is defined, all methods except those specified are run. + These arguments are mutually exclusive, so only `--methods_include` OR + `--methods_exclude` can set but not both. 
+ arguments: + - name: "--methods_include" + type: string + multiple: true + description: | + A list of method ids to include. If specified, only these methods will be run. + - name: "--methods_exclude" + type: string + multiple: true + description: | + A list of method ids to exclude. If specified, all methods except the ones listed will be run. + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /_viash.yaml + - path: /common/nextflow_helpers/helper.nf + +dependencies: + - name: h5ad/extract_uns_metadata + repository: core + - name: control_methods/no_denoising + - name: control_methods/perfect_denoising + - name: methods/alra + - name: methods/dca + - name: methods/knn_smoothing + - name: methods/magic + - name: methods/scprint + - name: metrics/mse + - name: metrics/poisson +runners: + - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf new file mode 100644 index 0000000..725ac72 --- /dev/null +++ b/src/workflows/run_benchmark/main.nf @@ -0,0 +1,193 @@ +include { checkItemAllowed } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + no_denoising, + perfect_denoising, + alra, + dca, + knn_smoothing, + magic, + scprint + ] + + // construct list of metrics + metrics = [ + mse, + poisson + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_uns_metadata.run( + fromState: [input: "input_test"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a defined method or all methods + filter: { id, state, comp -> + def method_check = checkItemAllowed( + comp.config.name, + state.methods_include, + state.methods_exclude, + "methods_include", + "methods_exclude" + ) + + method_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_train: "input_train", + input_test: "input_test" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." 
+ comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_test: "input_test", + input_prediction: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.name, + metric_output: output.output + ] + } + ) + + // extract the scores + | extract_uns_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + ["output", [output_scores: score_uns_file]] + } + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? + // NOTE: the 'denoising' task doesn't use normalized data, + // so code related to normalization_ids is commented out + + // extract the dataset metadata + meta_ch = dataset_ch + // // only keep one of the normalization methods + // | filter{ id, state -> + // state.dataset_uns.normalization_id == "log_cp10k" + // } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + // uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("_viash.yaml") + + // create output + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_dataset_info: dataset_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + output_ch = score_ch + | mix(meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} diff --git a/src/workflows/run_benchmark/run_test.sh b/src/workflows/run_benchmark/run_test.sh new file mode 100755 index 0000000..1f4559c --- /dev/null +++ b/src/workflows/run_benchmark/run_test.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +DATASETS_DIR="resources_test/denoising" +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=22.04.5 +nextflow run . 
\ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_train:output_train;input_test:output_test' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" diff --git a/target/.build.yaml b/target/.build.yaml new file mode 100644 index 0000000..e69de29 diff --git a/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/.config.vsh.yaml b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/.config.vsh.yaml new file mode 100644 index 0000000..4844b35 --- /dev/null +++ b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/.config.vsh.yaml @@ -0,0 +1,183 @@ +name: "extract_uns_metadata" +namespace: "h5ad" +version: "build_main" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "A h5ad file." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--schema" + description: "An optional schema with which to annotate the output" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--uns_length_cutoff" + description: "The maximum length of the .uns metadata to extract. If a value in\ + \ uns is a list or a dictionary with more elements than the provided cutoff,\ + \ it will not be extracted." + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "A yaml file containing the metadata." + info: null + example: + - "output_meta.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +description: "Extract .uns metadata from an h5ad file and write it to a yaml file." 
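+# Editorial note (illustrative, not part of the generated config): the benchmark
+# workflow in src/workflows/run_benchmark/main.nf reads this component's output
+# via `readYaml(output.output).uns`, so the produced output_meta.yaml is assumed
+# to carry a top-level `uns:` map with the extracted entries, e.g. (hypothetical keys):
+#   uns:
+#     dataset_id: "some_dataset"
+#     metric_ids: ["mse", "poisson"]
+#     metric_values: [0.42, 0.17]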
+test_resources: +- type: "file" + path: "pancreas" +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/core" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/h5ad/extract_uns_metadata/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/h5ad/extract_uns_metadata" + executable: "target/nextflow/h5ad/extract_uns_metadata/main.nf" + viash_version: "0.9.0" + git_commit: "405c288a53c9a011b41688a47a84c249aa7ba586" + git_remote: "https://github.com/openproblems-bio/core" +package_config: + name: "core" + version: "build_main" + description: "Core components for the OpenProblems project.\n" + info: + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/common/pancreas" + dest: "resources_test/common/pancreas" + viash_version: "0.9.0" + source: "src" + target: "target" + keywords: + - "openproblems" + - "common-resources" + - "single-cell" + - "benchmark" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/core" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/core/issues" diff --git 
a/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/main.nf b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/main.nf new file mode 100644 index 0000000..f91e082 --- /dev/null +++ b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/main.nf @@ -0,0 +1,3773 @@ +// extract_uns_metadata build_main +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? 
null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? 
null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. 
Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. 
+ if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$requiredKey' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ?
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "extract_uns_metadata", + "namespace" : "h5ad", + "version" : "build_main", + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "A h5ad file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--schema", + "description" : "An optional schema with which to annotate the output", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--uns_length_cutoff", + "description" : "The maximum length of the .uns metadata to extract. If a value in uns is a list or a dictionary with more elements than the provided cutoff, it will not be extracted.", + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "A yaml file containing the metadata.", + "example" : [ + "output_meta.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "description" : "Extract .uns metadata from an h5ad file and write it to a yaml file.", + "test_resources" : [ + { + "type" : "file", + "path" : "/resources_test/common/pancreas" + }, + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/core", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + 
"mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/", + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/core/core/viash/core/src/h5ad/extract_uns_metadata/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/h5ad/extract_uns_metadata", + "viash_version" : "0.9.0", + "git_commit" : "405c288a53c9a011b41688a47a84c249aa7ba586", + "git_remote" : "https://github.com/openproblems-bio/core" + }, + "package_config" : { + "name" : "core", + "version" : "build_main", + "description" : "Core components for the OpenProblems project.\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/pancreas", + "dest" : "resources_test/common/pancreas" + } + ] + }, + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "keywords" : [ + "openproblems", + "common-resources", + "single-cell", + "benchmark" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/core", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/core/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad +import yaml +import numpy as np +import pandas as pd +import scipy +import os +import datetime + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'schema': $( if [ ! -z ${VIASH_PAR_SCHEMA+x} ]; then echo "r'${VIASH_PAR_SCHEMA//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_length_cutoff': $( if [ ! -z ${VIASH_PAR_UNS_LENGTH_CUTOFF+x} ]; then echo "int(r'${VIASH_PAR_UNS_LENGTH_CUTOFF//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output': $( if [ ! 
-z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input']).copy() + +if par["schema"]: + print("Load schema", flush=True) + with open(par["schema"], "r") as f: + schema = yaml.safe_load(f) + + schema_info = schema.get("info") or {} + assert schema_info, "Schema must contain an 'info' field" + + schema_info_format = schema_info.get("format") or {} + assert schema_info_format, "Schema must contain a '.info.format' field" + + assert schema_info_format.get("type") == "h5ad", ".info.format.type must be 'h5ad'" +else: + schema = None + +#################################################################################################### +## Helper functions for extracting the dataset metadata in uns ## +#################################################################################################### +def is_atomic(obj): + return isinstance(obj, str) or isinstance(obj, int) or isinstance(obj, bool) or isinstance(obj, float) + +def to_atomic(obj): + if isinstance(obj, np.float64): + return float(obj) + elif isinstance(obj, np.int64): + return int(obj) + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.str_): + return str(obj) + return obj + +def is_list_of_atomics(obj): + if not isinstance(obj, (list,pd.core.series.Series,np.ndarray)): + return False + return all(is_atomic(elem) for elem in obj) + +def to_list_of_atomics(obj): + if isinstance(obj, pd.core.series.Series): + obj = obj.to_numpy() + if isinstance(obj, np.ndarray): + obj = obj.tolist() + return [to_atomic(elem) for elem in obj] + +def is_dict_of_atomics(obj): + if not isinstance(obj, dict): + return False + return all(is_atomic(elem) for _, elem in obj.items()) + +def to_dict_of_atomics(obj): + return {k: to_atomic(v) for k, v in obj.items()} + + +#################################################################################################### +## Helper functions for extracting metadata about the used data structures ## +#################################################################################################### +def get_structure_shape(obj) -> list: + if isinstance(obj, np.ndarray): + return list(obj.shape) + elif scipy.sparse.issparse(obj): + return list(obj.shape) + elif isinstance(obj, pd.core.frame.DataFrame): + return list(obj.shape) + elif isinstance(obj, pd.core.series.Series): + return list(obj.shape) + elif isinstance(obj, list): + return [len(obj)] + elif isinstance(obj, dict): + return [len(obj)] + elif is_atomic(obj): + return [1] + return None + +def get_structure_type(obj) -> str: + # return one of: atomic, dataFrame, vector, dict, denseMatrix, sparseMatrix + if is_atomic(obj): + return "atomic" + elif isinstance(obj, (list,pd.core.series.Series)): + return "vector" + elif isinstance(obj, dict): + return "dict" + elif isinstance(obj, pd.core.frame.DataFrame): + return "dataframe" + elif scipy.sparse.issparse(obj): + return "sparsematrix" + elif isinstance(obj, np.ndarray): + return "densematrix" + return "other: " + str(type(obj)) + +def get_structure_dtype(obj) -> str: + if isinstance(obj, np.ndarray): + return obj.dtype.name + elif isinstance(obj, pd.core.series.Series): + return obj.dtype.name + elif isinstance(obj, pd.core.frame.DataFrame): + return [dtype.name for dtype in obj.dtypes] + elif scipy.sparse.issparse(obj): + return obj.dtype.name + elif is_atomic(obj): + return type(obj).__name__ + return None + 
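+# Hypothetical illustration of what the helpers above report (values are made up,
+# not taken from a real dataset): a layer stored as a scipy CSR matrix of shape
+# (1500, 2000) with float32 values would be described as type "sparsematrix",
+# shape [1500, 2000] and dtype "float32"; a plain string in .uns would be
+# described as type "atomic", shape [1] and dtype "str".
+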
+def get_structure_schema_info(struct, key) -> dict: + if schema is None: + return {} + + struct_args = schema_info_format.get(struct, {}) + if struct_args is None: + return {} + if struct == "X": + return struct_args + + # look for item with the correct name + struct_results = [x for x in struct_args if x.get("name") == key] + + # return an empty dict if no match is found + if len(struct_results) != 1: + return {} + + return struct_results[0] + +def get_structure(adata, struct): + adata_struct = getattr(adata, struct) + + # turn \\`adata_struct\\` into a dict for \\`X\\` + if (struct == "X"): + adata_struct = {"X": adata_struct} if adata_struct is not None else {} + + output = [] + + for key, value in adata_struct.items(): + out = { + "name": key, + "type": get_structure_type(value), + "shape": get_structure_shape(value), + "dtype": get_structure_dtype(value), + } + + # see if the schema has information about this struct + schema_info = get_structure_schema_info(struct, key) + + copy = { + "description": "description", + "summary": "summary", + "label": "label", + "schema_type": "type" + } + for k, v in copy.items(): + if schema_info.get(v): + out[k] = schema_info.get(v) + + output.append(out) + + return output + +#################################################################################################### +## Other helper functions                                                                         ## +#################################################################################################### + +def get_file_size(path: str) -> int: + """Get the file size in bytes of the file at the given path.""" + return os.path.getsize(path) + +def get_file_creation_time(path: str) -> str: + """Get the creation time of the file at the given path.""" + # Get file creation time + creation_time = os.path.getctime(path) + # Convert creation time from seconds since epoch to a readable timestamp + creation_time = datetime.datetime.fromtimestamp(creation_time) + # Format the datetime object as 'DD-MM-YYYY' + creation_time = creation_time.strftime('%d-%m-%Y') + return str(creation_time) + +print("Extract metadata from object", flush=True) +# Extract metadata about the adata object +uns = {} +for key, val in adata.uns.items(): + if is_atomic(val): + uns[key] = to_atomic(val) + elif is_list_of_atomics(val) and len(val) <= par["uns_length_cutoff"]: + uns[key] = to_list_of_atomics(val) + elif is_dict_of_atomics(val) and len(val) <= par["uns_length_cutoff"]: + uns[key] = to_dict_of_atomics(val) + +uns["file_size"] = get_file_size(par["input"]) +uns["date_created"] = get_file_creation_time(par["input"]) + +# Extract metadata about the data structures +structure = { + struct: get_structure(adata, struct) + for struct + in ["X", "obs", "var", "obsp", "varp", "obsm", "varm", "layers", "uns"] +} + +# Create metadata object +meta = {"uns": uns, "structure": structure} + +print("Write metadata to file", flush=True) +with open(par["output"], "w") as f: + yaml.dump(meta, f, indent=2) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle.
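+ *
+ * Hypothetical illustration (identifiers below are examples, not part of the
+ * generated config): an input item such as ["sample1", [input: file("dataset.h5ad")]]
+ * is converted into the positional tuple the generated process expects, and the
+ * result is emitted again in [id, output_map] form, e.g. ["sample1", [output: <output file>]].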
+ */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/core/h5ad/extract_uns_metadata", + "tag" : "build_main" + }, + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/nextflow.config b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/nextflow.config new file mode 100644 index 0000000..28fb3d5 --- /dev/null +++ b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'h5ad/extract_uns_metadata' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'build_main' + description = 'Extract .uns metadata from an h5ad file and write it to a yaml file.' 
+} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + + diff 
--git a/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/.config.vsh.yaml b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/.config.vsh.yaml new file mode 100644 index 0000000..ed039e1 --- /dev/null +++ b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/.config.vsh.yaml @@ -0,0 +1,187 @@ +name: "verify_data_structure" +namespace: "schema" +version: "build_main" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "An input file. Can be an .h5ad, .parquet, .csv, or .tsv file." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--schema" + description: "A schema file for the input object." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "boolean" + name: "--stop_on_error" + description: "Whether or not to stop with exit code 1 if the input file does not\ + \ adhere to the schema." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "If specified, this file will contain a structured log of which checks\ + \ succeeded (or not)." + info: null + example: + - "checks.json" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +summary: "Checks a file against a schema" +description: "Checks if the file has the necessary data structures as defined in a\ + \ schema." 
+test_resources: +- type: "file" + path: "pancreas" +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/core" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/schema/verify_file_structure/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/schema/verify_data_structure" + executable: "target/nextflow/schema/verify_data_structure/main.nf" + viash_version: "0.9.0" + git_commit: "405c288a53c9a011b41688a47a84c249aa7ba586" + git_remote: "https://github.com/openproblems-bio/core" +package_config: + name: "core" + version: "build_main" + description: "Core components for the OpenProblems project.\n" + info: + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/common/pancreas" + dest: "resources_test/common/pancreas" + viash_version: "0.9.0" + source: "src" + target: "target" + keywords: + - "openproblems" + - "common-resources" + - "single-cell" + - "benchmark" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/core" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/core/issues" diff --git 
a/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/main.nf b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/main.nf new file mode 100644 index 0000000..fd6730e --- /dev/null +++ b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/main.nf @@ -0,0 +1,3652 @@ +// verify_data_structure build_main +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? 
null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? 
null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. 
Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. 
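+  // Editor's note (illustrative example): if `--param_list /data/params.yaml`
+  // contains `- {id: run1, input: train.h5ad}`, the relative value `train.h5ad`
+  // is resolved against the directory of the param_list file, giving
+  // `/data/train.h5ad`.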
+ if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
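+ *                   Example (editor's illustration, hypothetical state keys):
+ *                   `fromState: ["input"]` or
+ *                   `fromState: { id, state, comp -> [input: state.input] }`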
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
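+ // e.g. (illustrative values): ["prev_id", "sample_1", [output: f], extra]
+ //   becomes ["sample_1", [output: f], extra]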
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "verify_data_structure", + "namespace" : "schema", + "version" : "build_main", + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "An input file. Can be an .h5ad, .parquet, .csv, or .tsv file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--schema", + "description" : "A schema file for the input object.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--stop_on_error", + "description" : "Whether or not to stop with exit code 1 if the input file does not adhere to the schema.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "If specified, this file will contain a structured log of which checks succeeded (or not).", + "example" : [ + "checks.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "summary" : "Checks a file against a schema", + "description" : "Checks if the file has the necessary data structures as defined in a schema.", + "test_resources" : [ + { + "type" : "file", + "path" : "/resources_test/common/pancreas" + }, + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/core", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 
20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/", + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/core/core/viash/core/src/schema/verify_file_structure/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/schema/verify_data_structure", + "viash_version" : "0.9.0", + "git_commit" : "405c288a53c9a011b41688a47a84c249aa7ba586", + "git_remote" : "https://github.com/openproblems-bio/core" + }, + "package_config" : { + "name" : "core", + "version" : "build_main", + "description" : "Core components for the OpenProblems project.\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/pancreas", + "dest" : "resources_test/common/pancreas" + } + ] + }, + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "keywords" : [ + "openproblems", + "common-resources", + "single-cell", + "benchmark" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/core", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/core/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad +import pandas as pd +import yaml +import json + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'schema': $( if [ ! -z ${VIASH_PAR_SCHEMA+x} ]; then echo "r'${VIASH_PAR_SCHEMA//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'stop_on_error': $( if [ ! 
-z ${VIASH_PAR_STOP_ON_ERROR+x} ]; then echo "r'${VIASH_PAR_STOP_ON_ERROR//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# TODO: need to refactor to reuse the same helper functions as in 'run_and_check_output.py'. 
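+# Illustrative shape of a schema file the checks below expect -- a hypothetical
+# example for reference only, not a schema shipped with this component:
+#
+#   info:
+#     format:
+#       type: h5ad
+#       layers:
+#         - name: counts
+#           required: true
+#       uns:
+#         - name: dataset_id
+#           required: true
+#
+# Every slot listed under .info.format except "type" (e.g. X, layers, obs, uns) is
+# compared against the matching AnnData attribute; entries marked as required that
+# are missing are collected and written to the JSON report given by --output.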
+ +def check_h5ad_struct(struc, struc_fields, adata_slot): + missing = [] + if struc == "X": + struc_fields["name"] = "X" + struc_fields = [struc_fields] + for obj in struc_fields: + adata_data = adata_slot.get(obj['name']) if struc != 'X' else adata_slot + if obj.get('required') and adata_data is None: + missing.append(obj['name']) + # todo: check types + return missing + +def check_df_columns(df, columns): + missing = [] + for col in columns: + if col not in df.columns: + missing.append(col) + return missing + +print("Load schema", flush=True) +with open(par["schema"], "r") as f: + schema = yaml.safe_load(f) + +schema_info = schema.get("info") +assert schema_info, "Schema must contain an 'info' field" + +schema_info_format = schema_info.get("format") +assert schema_info_format, "Schema must contain a '.info.format' field" + +format_type = schema_info_format.get("type") +assert format_type == "h5ad", ".info.format.type must be 'h5ad'" + +# create output data structure +out = { + "exit_code": 0, + "error": {}, + "data_schema": "ok" +} + +print('Load data', flush=True) +if format_type == "h5ad": + data = ad.read_h5ad(par['input']) +elif format_type == "csv": + data = pd.read_csv(par['input']) +elif format_type == "tsv": + data = pd.read_csv(par['input'], sep="\\\\t") +elif format_type == "parquet": + data = pd.read_parquet(par['input']) +else: + raise ValueError(f"Unknown .info.format.type '{format_type}'") + +out = { + "exit_code": 0, + "error": {}, + "data_schema": "ok" +} +print("Check file against schema", flush=True) +if format_type == "h5ad": + for struc, struc_fields in schema_info_format.items(): + if struc == "type": + continue + print("Checking slot", struc, flush=True) + missing = check_h5ad_struct(struc, struc_fields, getattr(data, struc)) + if missing: + print(f"Dataset is missing {struc} {missing}", flush=True) + out['exit_code'] = 1 + out['data_schema'] = 'not ok' + out['error'][struc] = missing +elif format_type in ["csv", "tsv", "parquet"]: + columns = schema_info_format.get("columns") or [] + missing = check_df_columns(data, columns) + +with open(par["output"], "w") as f: + json.dump(out, f, indent=2) + +if par['stop_on_error']: + exit(out['exit_code']) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? 
data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/core/schema/verify_data_structure", + "tag" : "build_main" + }, + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/nextflow.config b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/nextflow.config new file mode 100644 index 0000000..e046bca --- /dev/null +++ b/target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'schema/verify_data_structure' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'build_main' + description = 'Checks if the file has the necessary data structures as defined in a schema.' 
+} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + + diff 
--git a/target/executable/control_methods/no_denoising/.config.vsh.yaml b/target/executable/control_methods/no_denoising/.config.vsh.yaml new file mode 100644 index 0000000..e0f5eea --- /dev/null +++ b/target/executable/control_methods/no_denoising/.config.vsh.yaml @@ -0,0 +1,300 @@ +name: "no_denoising" +namespace: "control_methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "No Denoising" +summary: "negative control by copying train counts" +description: "This method serves as a negative control, where the denoised data is\ + \ a copy of the unaltered training data. This represents the scoring threshold if\ + \ denoising was not performed on the data." 
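+# A minimal sketch of what this negative control boils down to -- illustrative only;
+# the packaged script.py may differ in details such as how obs/var are carried over:
+#
+#   import anndata as ad
+#   train = ad.read_h5ad(par["input_train"])
+#   out = ad.AnnData(
+#       obs=train.obs[[]],
+#       var=train.var[[]],
+#       layers={"denoised": train.layers["counts"]},
+#       uns={"dataset_id": train.uns["dataset_id"], "method_id": "no_denoising"},
+#   )
+#   out.write_h5ad(par["output"], compression="gzip")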
+test_resources: +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/baseline.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "control_method" + type_info: + label: "Control Method" + summary: "A control method." + description: "These components have the same interface as the regular methods\n\ + but also receive the solution object as input. It serves as a\nstarting point\ + \ to test the relative accuracy of new methods in\nthe task, and also as a quality\ + \ control for the metrics defined\nin the task.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + entrypoint: [] + cmd: null +build_info: + config: "src/control_methods/no_denoising/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/control_methods/no_denoising" + executable: "target/executable/control_methods/no_denoising/no_denoising" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. 
Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/control_methods/no_denoising/no_denoising b/target/executable/control_methods/no_denoising/no_denoising new file mode 100755 index 0000000..48ba15a --- /dev/null +++ b/target/executable/control_methods/no_denoising/no_denoising @@ -0,0 +1,1111 @@ +#!/usr/bin/env bash + +# no_denoising 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. 
The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="no_denoising" +VIASH_META_FUNCTIONALITY_NAME="no_denoising" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "no_denoising 1.0.0" + echo "" + echo "This method serves as a negative control, where the denoised data is a copy of" + echo "the unaltered training data. This represents the scoring threshold if denoising" + echo "was not performed on the data." + echo "" + echo "Arguments:" + echo " --input_train" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --input_test" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. 
Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_python:1.0.0 +ENTRYPOINT [] +LABEL org.opencontainers.image.description="Companion container for running component control_methods no_denoising" +LABEL org.opencontainers.image.created="2024-12-19T16:15:28Z" +LABEL org.opencontainers.image.source="https://github.com/openproblems-bio/task_denoising" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
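+        # '..' drops the last path component collected so far, e.g. /foo/bar/.. -> /foo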
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "no_denoising 1.0.0" + exit + ;; + --input_train) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_train. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_train=*) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train=*\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_test) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_test. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --input_test=*) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test=*\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
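+  # Only a Docker engine is compiled into this wrapper; it can be selected explicitly
+  # with '---engine docker'. Related introspection modes (illustrative invocations):
+  #   ./no_denoising ---dockerfile       # print the embedded Dockerfile and exit
+  #   ./no_denoising ---docker_image_id  # print the image tag this wrapper uses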
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/control_methods/no_denoising:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then + ViashError '--input_train' is a required argument. Use "--help" to get more information on the parameters. 
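+  # A complete invocation supplies all three required files, for instance (input paths
+  # follow the examples used in this task's configs; the output path is arbitrary):
+  #   ./no_denoising \
+  #     --input_train resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad \
+  #     --input_test  resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad \
+  #     --output      output/denoised.h5ad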
+ exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_TEST+x} ]; then + ViashError '--input_test' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_INPUT_TRAIN" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TRAIN' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_TEST" ] && [ ! -e "$VIASH_PAR_INPUT_TEST" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TEST' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! 
[[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TRAIN")" ) + VIASH_PAR_INPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TRAIN") +fi +if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TEST")" ) + VIASH_PAR_INPUT_TEST=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TEST") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! 
-z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-no_denoising-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! 
-z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_train.layers['counts'] + +input_train.uns["method_id"] = meta['name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_PAR_INPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TRAIN") + fi + if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_PAR_INPUT_TEST=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TEST") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/control_methods/perfect_denoising/.config.vsh.yaml b/target/executable/control_methods/perfect_denoising/.config.vsh.yaml new file mode 100644 index 0000000..3aba3d1 --- /dev/null +++ b/target/executable/control_methods/perfect_denoising/.config.vsh.yaml @@ -0,0 +1,300 @@ +name: "perfect_denoising" +namespace: "control_methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." 
+ required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "Perfect Denoising" +summary: "Positive control by copying the test counts" +description: "This method serves as a positive control, where the test data is copied\ + \ 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised\ + \ as it will be compared to the test data in the metrics." +test_resources: +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/baseline.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "control_method" + type_info: + label: "Control Method" + summary: "A control method." + description: "These components have the same interface as the regular methods\n\ + but also receive the solution object as input. 
It serves as a\nstarting point\ + \ to test the relative accuracy of new methods in\nthe task, and also as a quality\ + \ control for the metrics defined\nin the task.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + entrypoint: [] + cmd: null +build_info: + config: "src/control_methods/perfect_denoising/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/control_methods/perfect_denoising" + executable: "target/executable/control_methods/perfect_denoising/perfect_denoising" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. 
As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/control_methods/perfect_denoising/perfect_denoising b/target/executable/control_methods/perfect_denoising/perfect_denoising new file mode 100755 index 0000000..7b80fd2 --- /dev/null +++ b/target/executable/control_methods/perfect_denoising/perfect_denoising @@ -0,0 +1,1112 @@ +#!/usr/bin/env bash + +# perfect_denoising 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
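+# Positive-control component: per the component description, the test counts are
+# copied 1-to-1 into the 'denoised' output layer, so the metrics effectively compare
+# the test data with itself.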
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="perfect_denoising" +VIASH_META_FUNCTIONALITY_NAME="perfect_denoising" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "perfect_denoising 1.0.0" + echo "" + echo "This method serves as a positive control, where the test data is copied 1-to-1" + echo "to the denoised data. This makes it seem as if the data is perfectly denoised as" + echo "it will be compared to the test data in the metrics." + echo "" + echo "Arguments:" + echo " --input_train" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --input_test" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. 
Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? 
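+  # e.g. ViashDockerPullElseBuild ghcr.io/openproblems-bio/task_denoising/control_methods/perfect_denoising:1.0.0
+  # first attempts a 'docker pull'; only when the pull fails does it fall back to
+  # ViashDockerBuild, forwarding any extra arguments (such as --no-cache).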
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
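+    # The push strategies are intended for publishing the image to the registry
+    # (ghcr.io for this repository) and assume a prior 'docker login', e.g.:
+    #   ./perfect_denoising ---setup push        # always push
+    #   ./perfect_denoising ---setup gentlepush  # push only when the remote tag is missing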
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
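+  # Sketch of what $docker_build_cmd expands to for this component (resources path and
+  # temp dir are placeholders; extra flags such as --no-cache depend on the strategy):
+  #   docker build -t 'ghcr.io/openproblems-bio/task_denoising/control_methods/perfect_denoising:1.0.0' \
+  #     '/path/to/target/executable/control_methods/perfect_denoising' \
+  #     -f '/tmp/dockerbuild-perfect_denoising-XXXXXX/Dockerfile'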
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_python:1.0.0 +ENTRYPOINT [] +LABEL org.opencontainers.image.description="Companion container for running component control_methods perfect_denoising" +LABEL org.opencontainers.image.created="2024-12-19T16:15:28Z" +LABEL org.opencontainers.image.source="https://github.com/openproblems-bio/task_denoising" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "perfect_denoising 1.0.0" + exit + ;; + --input_train) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_train. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_train=*) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train=*\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_test) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_test. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --input_test=*) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test=*\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/control_methods/perfect_denoising:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then + ViashError '--input_train' is a required argument. Use "--help" to get more information on the parameters. 
+ exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_TEST+x} ]; then + ViashError '--input_test' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_INPUT_TRAIN" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TRAIN' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_TEST" ] && [ ! -e "$VIASH_PAR_INPUT_TEST" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TEST' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! 
[[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TRAIN")" ) + VIASH_PAR_INPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TRAIN") +fi +if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TEST")" ) + VIASH_PAR_INPUT_TEST=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TEST") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! 
-z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-perfect_denoising-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! 
-z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_test.layers['counts'] + +input_train.uns["method_id"] = meta['name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_PAR_INPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TRAIN") + fi + if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_PAR_INPUT_TEST=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TEST") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/data_processors/process_dataset/.config.vsh.yaml b/target/executable/data_processors/process_dataset/.config.vsh.yaml new file mode 100644 index 0000000..e8d3e90 --- /dev/null +++ b/target/executable/data_processors/process_dataset/.config.vsh.yaml @@ -0,0 +1,364 @@ +name: "process_dataset" +namespace: "data_processors" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + label: "Common Dataset" + summary: "A subset of the common dataset." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + obs: + - type: "string" + name: "batch" + description: "Batch information" + required: false + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." 
+ required: false + example: + - "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--method" + description: "The process method to assign train/test." + info: null + default: + - "mcv" + required: false + choices: + - "mcv" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--train_frac" + description: "The fraction the molecules need to be split to train dataset" + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "A seed for the subsampling." + info: null + example: + - 123 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_obs_limit" + description: "The maximum number of cells the dataset may have before subsampling\ + \ according to `obs.batch`." 
+ info: null + default: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +description: "Split data using molecular cross-validation.\n\nSplits molecules into\ + \ two (potentially overlapping) groups using a fraction ratio.\nThese are output\ + \ as two separate AnnData objects.\n" +test_resources: +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "file" + path: "resources_test/common/cxg_immune_cell_atlas" + dest: "resources_test/common/cxg_immune_cell_atlas" +info: + type: "process_dataset" + type_info: + label: "Data processor" + summary: "A denoising dataset processor." + description: "A component for processing a Common Dataset into a task-specific\ + \ dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "midcpu" + - "midtime" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "numpy" + - "scipy" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/data_processors/process_dataset/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/data_processors/process_dataset" + executable: "target/executable/data_processors/process_dataset/process_dataset" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). 
In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/data_processors/process_dataset/helper.py b/target/executable/data_processors/process_dataset/helper.py new file mode 100644 index 0000000..2044ed4 --- /dev/null +++ b/target/executable/data_processors/process_dataset/helper.py @@ -0,0 +1,55 @@ +# MIT License + +# Copyright (c) 2019 Chan Zuckerberg Biohub + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Copied from https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/util.py + + +from typing import Tuple + +import numpy as np + +def split_molecules( + umis: np.ndarray, + data_split: float, + overlap_factor: float = 0.0, + random_state: np.random.RandomState = None, +) -> Tuple[np.ndarray, np.ndarray]: + """Splits molecules into two (potentially overlapping) groups. + :param umis: Array of molecules to split + :param data_split: Proportion of molecules to assign to the first group + :param overlap_factor: Overlap correction factor, if desired + :param random_state: For reproducible sampling + :return: umis_X and umis_Y, representing ``split`` and ``~(1 - split)`` counts + sampled from the input array + """ + if random_state is None: + random_state = np.random.RandomState() + + umis_X_disjoint = random_state.binomial(umis, data_split - overlap_factor) + umis_Y_disjoint = random_state.binomial( + umis - umis_X_disjoint, (1 - data_split) / (1 - data_split + overlap_factor) + ) + overlap_factor = umis - umis_X_disjoint - umis_Y_disjoint + umis_X = umis_X_disjoint + overlap_factor + umis_Y = umis_Y_disjoint + overlap_factor + + return umis_X, umis_Y \ No newline at end of file diff --git a/target/executable/data_processors/process_dataset/process_dataset b/target/executable/data_processors/process_dataset/process_dataset new file mode 100755 index 0000000..9ba26f7 --- /dev/null +++ b/target/executable/data_processors/process_dataset/process_dataset @@ -0,0 +1,1304 @@ +#!/usr/bin/env bash + +# process_dataset 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
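+# Illustrative example invocation of this wrapper (a sketch only: the input
+# path follows the example in .config.vsh.yaml, while the output paths and
+# parameter values below are placeholders, not part of the generated script):
+#
+#   target/executable/data_processors/process_dataset/process_dataset \
+#     --input resources_test/common/cxg_immune_cell_atlas/dataset.h5ad \
+#     --output_train output/train.h5ad \
+#     --output_test output/test.h5ad \
+#     --train_frac 0.9 \
+#     --seed 123
+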
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="process_dataset" +VIASH_META_FUNCTIONALITY_NAME="process_dataset" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "process_dataset 1.0.0" + echo "" + echo "Split data using molecular cross-validation." + echo "" + echo "Splits molecules into two (potentially overlapping) groups using a fraction" + echo "ratio." + echo "These are output as two separate AnnData objects." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" + echo "" + echo " --output_train" + echo " type: file, required parameter, output, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --output_test" + echo " type: file, required parameter, output, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + echo "" + echo " --method" + echo " type: string" + echo " default: mcv" + echo " choices: [ mcv ]" + echo " The process method to assign train/test." + echo "" + echo " --train_frac" + echo " type: double" + echo " default: 0.9" + echo " The fraction the molecules need to be split to train dataset" + echo "" + echo " --seed" + echo " type: integer" + echo " example: 123" + echo " A seed for the subsampling." + echo "" + echo " --n_obs_limit" + echo " type: integer" + echo " default: 10000" + echo " The maximum number of cells the dataset may have before subsampling" + echo " according to \`obs.batch\`." 
+} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." 
+ fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_python:1.0.0 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "numpy" "scipy" + +LABEL org.opencontainers.image.description="Companion container for running component data_processors process_dataset" +LABEL org.opencontainers.image.created="2024-12-19T16:15:29Z" +LABEL org.opencontainers.image.source="https://github.com/openproblems-bio/task_denoising" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "process_dataset 1.0.0" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_train) + [ -n "$VIASH_PAR_OUTPUT_TRAIN" ] && ViashError Bad arguments for option \'--output_train\': \'$VIASH_PAR_OUTPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_train. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --output_train=*) + [ -n "$VIASH_PAR_OUTPUT_TRAIN" ] && ViashError Bad arguments for option \'--output_train=*\': \'$VIASH_PAR_OUTPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_test) + [ -n "$VIASH_PAR_OUTPUT_TEST" ] && ViashError Bad arguments for option \'--output_test\': \'$VIASH_PAR_OUTPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_TEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_test. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_test=*) + [ -n "$VIASH_PAR_OUTPUT_TEST" ] && ViashError Bad arguments for option \'--output_test=*\': \'$VIASH_PAR_OUTPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_TEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --method) + [ -n "$VIASH_PAR_METHOD" ] && ViashError Bad arguments for option \'--method\': \'$VIASH_PAR_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METHOD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --method. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --method=*) + [ -n "$VIASH_PAR_METHOD" ] && ViashError Bad arguments for option \'--method=*\': \'$VIASH_PAR_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METHOD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --train_frac) + [ -n "$VIASH_PAR_TRAIN_FRAC" ] && ViashError Bad arguments for option \'--train_frac\': \'$VIASH_PAR_TRAIN_FRAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRAIN_FRAC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --train_frac. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --train_frac=*) + [ -n "$VIASH_PAR_TRAIN_FRAC" ] && ViashError Bad arguments for option \'--train_frac=*\': \'$VIASH_PAR_TRAIN_FRAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRAIN_FRAC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seed) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seed=*) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed=*\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_obs_limit) + [ -n "$VIASH_PAR_N_OBS_LIMIT" ] && ViashError Bad arguments for option \'--n_obs_limit\': \'$VIASH_PAR_N_OBS_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_OBS_LIMIT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_obs_limit. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_obs_limit=*) + [ -n "$VIASH_PAR_N_OBS_LIMIT" ] && ViashError Bad arguments for option \'--n_obs_limit=*\': \'$VIASH_PAR_N_OBS_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. 
&& exit 1 + VIASH_PAR_N_OBS_LIMIT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/data_processors/process_dataset:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. 
+ exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_TRAIN+x} ]; then + ViashError '--output_train' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_TEST+x} ]; then + ViashError '--output_test' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_METHOD+x} ]; then + VIASH_PAR_METHOD="mcv" +fi +if [ -z ${VIASH_PAR_TRAIN_FRAC+x} ]; then + VIASH_PAR_TRAIN_FRAC="0.9" +fi +if [ -z ${VIASH_PAR_N_OBS_LIMIT+x} ]; then + VIASH_PAR_N_OBS_LIMIT="10000" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_TRAIN_FRAC" ]]; then + if ! [[ "$VIASH_PAR_TRAIN_FRAC" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--train_frac' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEED" ]]; then + if ! [[ "$VIASH_PAR_SEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_OBS_LIMIT" ]]; then + if ! [[ "$VIASH_PAR_N_OBS_LIMIT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_obs_limit' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. 
Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_METHOD" ]; then + VIASH_PAR_METHOD_CHOICES=("mcv") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_METHOD_CHOICES[*]};" =~ ";$VIASH_PAR_METHOD;" ]]; then + ViashError '--method' specified value of \'$VIASH_PAR_METHOD\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT_TRAIN" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_TRAIN")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_TRAIN")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_TEST" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_TEST")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_TEST")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_TRAIN")" ) + VIASH_PAR_OUTPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_TRAIN") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_TRAIN" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_TEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_TEST")" ) + VIASH_PAR_OUTPUT_TEST=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_TEST") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_TEST" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! 
-z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-process_dataset-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import random +import anndata as ad +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_train': $( if [ ! -z ${VIASH_PAR_OUTPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_TRAIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_test': $( if [ ! -z ${VIASH_PAR_OUTPUT_TEST+x} ]; then echo "r'${VIASH_PAR_OUTPUT_TEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'method': $( if [ ! -z ${VIASH_PAR_METHOD+x} ]; then echo "r'${VIASH_PAR_METHOD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'train_frac': $( if [ ! -z ${VIASH_PAR_TRAIN_FRAC+x} ]; then echo "float(r'${VIASH_PAR_TRAIN_FRAC//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_obs_limit': $( if [ ! -z ${VIASH_PAR_N_OBS_LIMIT+x} ]; then echo "int(r'${VIASH_PAR_N_OBS_LIMIT//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! 
-z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from helper import split_molecules + +# set random state +random_state = np.random.RandomState(par['seed']) + +print(">> Load Data", flush=True) +adata = ad.read_h5ad(par["input"]) + +# limit to max number of observations +adata_output = adata.copy() + +if "batch" in adata.obs: + print(">> Subsampling observations by largest batch", flush=True) + batch_counts = adata.obs.groupby('batch').size() + sorted_batches = batch_counts.sort_values(ascending=False) + selected_batch = sorted_batches.index[0] + adata_output = adata[adata.obs["batch"]==selected_batch,:].copy() + +if adata_output.n_obs > par["n_obs_limit"]: + print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True) + print(f">> Setting seed to {par['seed']}", flush=True) + random.seed(par["seed"]) + obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs) + obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False) + adata_output = adata_output[obs_index].copy() + +# remove all layers except for counts +print(">> Remove all layers except for counts", flush=True) +for key in list(adata_output.layers.keys()): + if key != "counts": + del adata_output.layers[key] + +# round counts and convert to int +print(">> Round counts and convert to int", flush=True) +counts = np.array(adata_output.layers["counts"]).round().astype(int) + +print(">> process and split data", flush=True) +# split the molecules of each cell into training and test counts (molecular cross-validation) +train_data, test_data = split_molecules( + counts.data, par["train_frac"], 0.0, random_state +) + +X_train = counts.copy() +X_test = counts.copy() +X_train.data = train_data +X_test.data = test_data +X_train.eliminate_zeros() +X_test.eliminate_zeros() + +# create the train and test AnnData objects +print(">> Create AnnData output objects", flush=True) +train_uns_keys = ["dataset_id", "dataset_organism"] +output_train = ad.AnnData( + layers={"counts": X_train}, + obs=adata_output.obs[[]], + var=adata_output.var[[]], + uns={key: adata_output.uns[key] for key in train_uns_keys} +) +test_uns_keys = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +output_test = ad.AnnData( + layers={"counts": X_test}, + obs=adata_output.obs[[]], + var=adata_output.var[[]], + uns={key: adata_output.uns[key] for key in test_uns_keys} +) + +# add information about the train set to the test object +output_test.uns["train_sum"] = X_train.sum() + +# Remove genes that have no counts in the training data +print(">> Remove genes that have no counts in the training data", flush=True) +is_missing = np.array(X_train.sum(axis=0) == 0) + +output_train = output_train[:, ~is_missing.flatten()] +output_test = output_test[:, ~is_missing.flatten()] + +print(">> Write to file", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_TRAIN" ]; then + VIASH_PAR_OUTPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_TRAIN") + fi + if [ ! 
-z "$VIASH_PAR_OUTPUT_TEST" ]; then + VIASH_PAR_OUTPUT_TEST=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_TEST") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_OUTPUT_TRAIN" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_TRAIN' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_TEST" ] && [ ! -e "$VIASH_PAR_OUTPUT_TEST" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_TEST' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/methods/alra/.config.vsh.yaml b/target/executable/methods/alra/.config.vsh.yaml new file mode 100644 index 0000000..d5800cd --- /dev/null +++ b/target/executable/methods/alra/.config.vsh.yaml @@ -0,0 +1,278 @@ +name: "alra" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--norm" + description: "Normalization method" + info: null + default: + - "log" + required: false + choices: + - "sqrt" + - "log" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +label: "ALRA" +summary: "ALRA imputes missing values in scRNA-seq data by computing rank-k approximation,\ + \ thresholding by gene, and rescaling the matrix." +description: "Adaptively-thresholded Low Rank Approximation (ALRA). 
\n\nALRA is a\ + \ method for imputation of missing values in single cell RNA-sequencing data, \n\ + described in the preprint, \"Zero-preserving imputation of scRNA-seq data using\ + \ low-rank approximation\" \navailable [here](https://www.biorxiv.org/content/early/2018/08/22/397588).\ + \ Given a \nscRNA-seq expression matrix, ALRA first computes its rank-k approximation\ + \ using randomized SVD. \nNext, each row (gene) is thresholded by the magnitude\ + \ of the most negative value of that gene. \nFinally, the matrix is rescaled.\n" +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/alra.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1101/397588" +links: + repository: "https://github.com/KlugerLab/ALRA" + docker_registry: "ghcr.io" + documentation: "https://github.com/KlugerLab/ALRA/blob/master/README.md" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_r:1.0.0" + namespace_separator: "/" + setup: + - type: "r" + cran: + - "Matrix" + - "rsvd" + github: + - "KlugerLab/ALRA" + bioc_force_install: false + entrypoint: [] + cmd: null +build_info: + config: "src/methods/alra/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/methods/alra" + executable: "target/executable/methods/alra/alra" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. 
Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/methods/alra/alra b/target/executable/methods/alra/alra new file mode 100755 index 0000000..bfce083 --- 
/dev/null +++ b/target/executable/methods/alra/alra @@ -0,0 +1,1168 @@ +#!/usr/bin/env bash + +# alra 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. 
corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="alra" +VIASH_META_FUNCTIONALITY_NAME="alra" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "alra 1.0.0" + echo "" + echo "Adaptively-thresholded Low Rank Approximation (ALRA)." + echo "" + echo "ALRA is a method for imputation of missing values in single cell RNA-sequencing" + echo "data," + echo "described in the preprint, \"Zero-preserving imputation of scRNA-seq data using" + echo "low-rank approximation\"" + echo "available [here](https://www.biorxiv.org/content/early/2018/08/22/397588). Given" + echo "a" + echo "scRNA-seq expression matrix, ALRA first computes its rank-k approximation using" + echo "randomized SVD." + echo "Next, each row (gene) is thresholded by the magnitude of the most negative value" + echo "of that gene." + echo "Finally, the matrix is rescaled." 
+ echo "" + echo "Arguments:" + echo " --input_train" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + echo "" + echo " --norm" + echo " type: string" + echo " default: log" + echo " choices: [ sqrt, log ]" + echo " Normalization method" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? 
# returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? 
+ [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_r:1.0.0 +ENTRYPOINT [] +RUN Rscript -e 'if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'remotes::install_cran(c("Matrix", "rsvd"), repos = "https://cran.rstudio.com")' && \ + Rscript -e 'remotes::install_github(c("KlugerLab/ALRA"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.description="Companion container for running component methods alra" +LABEL org.opencontainers.image.created="2024-12-19T16:15:28Z" +LABEL org.opencontainers.image.source="https://github.com/KlugerLab/ALRA" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "alra 1.0.0" + exit + ;; + --input_train) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_train. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_train=*) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train=*\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --norm) + [ -n "$VIASH_PAR_NORM" ] && ViashError Bad arguments for option \'--norm\': \'$VIASH_PAR_NORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NORM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --norm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --norm=*) + [ -n "$VIASH_PAR_NORM" ] && ViashError Bad arguments for option \'--norm=*\': \'$VIASH_PAR_NORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NORM=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/methods/alra:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then + ViashError '--input_train' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. 
Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_NORM+x} ]; then + VIASH_PAR_NORM="log" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_INPUT_TRAIN" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TRAIN' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. 
Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_NORM" ]; then + VIASH_PAR_NORM_CHOICES=("sqrt;log") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_NORM_CHOICES[*]};" =~ ";$VIASH_PAR_NORM;" ]]; then + ViashError '--norm' specified value of \'$VIASH_PAR_NORM\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TRAIN")" ) + VIASH_PAR_INPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TRAIN") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! 
-z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-alra-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +cat(">> Loading dependencies\\n") +library(anndata, warn.conflicts = FALSE) +library(ALRA, warn.conflicts = FALSE) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input_train" = $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_TRAIN" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "norm" = $( if [ ! -z ${VIASH_PAR_NORM+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_NORM" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! 
-z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +cat(">> Load input data\\n") +input_train <- read_h5ad(par\$input_train) + +cat(">> Set normalization method\\n") +if (par\$norm == "sqrt") { + norm_fn <- sqrt + denorm_fn <- function(x) x^2 +} else if (par\$norm == "log") { + norm_fn <- log1p + denorm_fn <- expm1 +} else { + stop("Unknown normalization method: ", par\$norm) +} + +cat(">> Normalize data\\n") +data <- as.matrix(input_train\$layers[["counts"]]) +totalPerCell <- rowSums(data) +data <- sweep(data, 1, totalPerCell, "/") +data <- norm_fn(data) + +cat(">> Run ALRA\\n") +data <- alra(data)\$A_norm_rank_k_cor_sc +data <- denorm_fn(data) +data <- sweep(data, 1, totalPerCell, "*") + +cat(">> Store output\\n") +output <- AnnData( + layers = list(denoised = data), + obs = input_train\$obs[, c(), drop = FALSE], + var = input_train\$var[, c(), drop = FALSE], + uns = list( + dataset_id = input_train\$uns[["dataset_id"]], + method_id = meta\$name + ) +) + +cat(">> Write output to file\\n") +output\$write_h5ad(par\$output, compression = "gzip") +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_PAR_INPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TRAIN") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." 
+ exit 1 +fi + + +exit 0 diff --git a/target/executable/methods/dca/.config.vsh.yaml b/target/executable/methods/dca/.config.vsh.yaml new file mode 100644 index 0000000..1a6bb33 --- /dev/null +++ b/target/executable/methods/dca/.config.vsh.yaml @@ -0,0 +1,285 @@ +name: "dca" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--epochs" + description: "Number of total epochs in training" + info: null + default: + - 300 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "DCA" +summary: "A deep autoencoder with ZINB loss function to address the dropout effect\ + \ in count data" +description: "\"Deep Count Autoencoder\n\nRemoves the dropout effect by taking the\ + \ count structure, overdispersed nature and sparsity of the data into account \n\ + using a deep autoencoder with zero-inflated negative binomial (ZINB) loss function.\"\ + \n" +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/dca.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. 
technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1038/s41467-018-07931-2" +links: + repository: "https://github.com/theislab/dca" + docker_registry: "ghcr.io" + documentation: "https://github.com/theislab/dca#readme" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.9" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + github: + - "openproblems-bio/core#subdirectory=packages/python/openproblems" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.8.0" + - "scanpy" + - "pyyaml" + - "requests" + - "jsonschema" + - "git+https://github.com/scottgigante-immunai/dca.git@patch-1" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/dca/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/methods/dca" + executable: "target/executable/methods/dca/dca" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. 
The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/methods/dca/dca b/target/executable/methods/dca/dca new file mode 100755 index 0000000..bfcda1b --- /dev/null +++ b/target/executable/methods/dca/dca @@ -0,0 +1,1137 @@ +#!/usr/bin/env bash + +# dca 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
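+#
+# Usage sketch (illustrative): once built, this wrapper is invoked directly with
+# the arguments declared in the accompanying .config.vsh.yaml. The input path is
+# the example value from that config; the output path is an assumption chosen for
+# this example, and --epochs may be omitted to fall back to its default of 300.
+#
+#   target/executable/methods/dca/dca \
+#     --input_train resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad \
+#     --output output/denoised.h5ad \
+#     --epochs 300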
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="dca" +VIASH_META_FUNCTIONALITY_NAME="dca" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "dca 1.0.0" + echo "" + echo "\"Deep Count Autoencoder" + echo "" + echo "Removes the dropout effect by taking the count structure, overdispersed nature" + echo "and sparsity of the data into account" + echo "using a deep autoencoder with zero-inflated negative binomial (ZINB) loss" + echo "function.\"" + echo "" + echo "Arguments:" + echo " --input_train" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + echo "" + echo " --epochs" + echo " type: integer" + echo " default: 300" + echo " Number of total epochs in training" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. 
Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.9 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.8.0" "scanpy" "pyyaml" "requests" "jsonschema" "git+https://github.com/scottgigante-immunai/dca.git@patch-1" "numpy<2" + +LABEL org.opencontainers.image.description="Companion container for running component methods dca" +LABEL org.opencontainers.image.created="2024-12-19T16:15:29Z" +LABEL org.opencontainers.image.source="https://github.com/theislab/dca" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "dca 1.0.0" + exit + ;; + --input_train) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_train. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_train=*) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train=*\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --epochs) + [ -n "$VIASH_PAR_EPOCHS" ] && ViashError Bad arguments for option \'--epochs\': \'$VIASH_PAR_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --epochs=*) + [ -n "$VIASH_PAR_EPOCHS" ] && ViashError Bad arguments for option \'--epochs=*\': \'$VIASH_PAR_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/methods/dca:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then + ViashError '--input_train' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. 
Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_EPOCHS+x} ]; then + VIASH_PAR_EPOCHS="300" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_INPUT_TRAIN" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TRAIN' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. 
Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TRAIN")" ) + VIASH_PAR_INPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TRAIN") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! 
-z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-dca-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import anndata as ad +from dca.api import dca + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'epochs': $( if [ ! -z ${VIASH_PAR_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! 
-z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) + +print("Remove unneeded data", flush=True) +output = ad.AnnData( + X=input_train.layers["counts"], + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Run DCA", flush=True) +dca(output, epochs=par["epochs"]) + +print("Move output to correct location", flush=True) +output.layers["denoised"] = output.X +del output.X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_PAR_INPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TRAIN") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/methods/knn_smoothing/.config.vsh.yaml b/target/executable/methods/knn_smoothing/.config.vsh.yaml new file mode 100644 index 0000000..e210945 --- /dev/null +++ b/target/executable/methods/knn_smoothing/.config.vsh.yaml @@ -0,0 +1,268 @@ +name: "knn_smoothing" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." 
+ info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "KNN Smoothing" +summary: "Iterative kNN-smoothing denoises scRNA-seq data by iteratively increasing\ + \ the size of neighbourhoods for smoothing until a maximum k value is reached." +description: "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq\ + \ expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first\ + \ applies initial normalisation and smoothing. Then, a chosen number of principal\ + \ components is used to calculate Euclidean distances between cells. Minimally sized\ + \ neighbourhoods are initially determined from these Euclidean distances, and expression\ + \ profiles are shared between neighbouring cells. Then, the resultant smoothed matrix\ + \ is used as input to the next step of smoothing, where the size (k) of the considered\ + \ neighbourhoods is increased, leading to greater smoothing. This process continues\ + \ until a chosen maximum k value has been reached, at which point the iteratively\ + \ smoothed object is then optionally scaled to yield a final result." +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/knn_smoothing.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. 
technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1101/217737" +links: + repository: "https://github.com/yanailab/knn-smoothing" + docker_registry: "ghcr.io" + documentation: "https://github.com/yanailab/knn-smoothing#readme" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "scipy" + github: + - "scottgigante-immunai/knn-smoothing@python_package" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/knn_smoothing/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/methods/knn_smoothing" + executable: "target/executable/methods/knn_smoothing/knn_smoothing" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. 
As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/methods/knn_smoothing/knn_smoothing b/target/executable/methods/knn_smoothing/knn_smoothing new file mode 100755 index 0000000..236c75c --- /dev/null +++ b/target/executable/methods/knn_smoothing/knn_smoothing @@ -0,0 +1,1109 @@ +#!/usr/bin/env bash + +# knn_smoothing 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="knn_smoothing" +VIASH_META_FUNCTIONALITY_NAME="knn_smoothing" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "knn_smoothing 1.0.0" + echo "" + echo "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq" + echo "expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first" + echo "applies initial normalisation and smoothing. Then, a chosen number of principal" + echo "components is used to calculate Euclidean distances between cells. Minimally" + echo "sized neighbourhoods are initially determined from these Euclidean distances," + echo "and expression profiles are shared between neighbouring cells. Then, the" + echo "resultant smoothed matrix is used as input to the next step of smoothing, where" + echo "the size (k) of the considered neighbourhoods is increased, leading to greater" + echo "smoothing. This process continues until a chosen maximum k value has been" + echo "reached, at which point the iteratively smoothed object is then optionally" + echo "scaled to yield a final result." + echo "" + echo "Arguments:" + echo " --input_train" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! 
command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_python:1.0.0 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scipy" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/scottgigante-immunai/knn-smoothing@python_package" + +LABEL org.opencontainers.image.description="Companion container for running component methods knn_smoothing" +LABEL org.opencontainers.image.created="2024-12-19T16:15:29Z" +LABEL org.opencontainers.image.source="https://github.com/yanailab/knn-smoothing" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "knn_smoothing 1.0.0" + exit + ;; + --input_train) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_train. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_train=*) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train=*\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/methods/knn_smoothing:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then + ViashError '--input_train' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. 
Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_INPUT_TRAIN" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TRAIN' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! 
[[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TRAIN")" ) + VIASH_PAR_INPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TRAIN") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-knn_smoothing-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." 
+ exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import knn_smooth +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par["input_train"]) + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"].astype(float).transpose().toarray() + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Run KNN smoothing", flush=True) +X = knn_smooth.knn_smoothing(X, k=10).transpose() + +print("Process data", flush=True) +output.layers["denoised"] = X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_PAR_INPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TRAIN") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/methods/magic/.config.vsh.yaml b/target/executable/methods/magic/.config.vsh.yaml new file mode 100644 index 0000000..7b827be --- /dev/null +++ b/target/executable/methods/magic/.config.vsh.yaml @@ -0,0 +1,323 @@ +name: "magic" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." 
+ info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--solver" + description: "Which solver to use." + info: null + default: + - "exact" + required: false + choices: + - "exact" + - "approximate" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--norm" + description: "Normalization method" + info: null + default: + - "log" + required: false + choices: + - "sqrt" + - "log" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--decay" + description: "sets decay rate of kernel tails" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--t" + description: "power to which the diffusion operator is powered" + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "MAGIC" +summary: "MAGIC imputes and denoises scRNA-seq data that is noisy or dropout-prone." +description: "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method\ + \ for imputation and denoising of noisy or dropout-prone single cell RNA-sequencing\ + \ data. Given a normalised scRNA-seq expression matrix, it first calculates Euclidean\ + \ distances between each pair of cells in the dataset, which is then augmented using\ + \ a Gaussian kernel (function) and row-normalised to give a normalised affinity\ + \ matrix. A t-step markov process is then calculated, by powering this affinity\ + \ matrix t times. Finally, the powered affinity matrix is right-multiplied by the\ + \ normalised data, causing the final imputed values to take the value of a per-gene\ + \ average weighted by the affinities of cells. The resultant imputed matrix is then\ + \ rescaled, to more closely match the magnitude of measurements in the normalised\ + \ (input) matrix." +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/magic.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + variants: + magic_approx: + solver: "approximate" + magic_knn_naive: + norm: "log" + decay: "none" + t: 1 + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. 
technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1016/j.cell.2018.05.061" +links: + repository: "https://github.com/KrishnaswamyLab/MAGIC" + docker_registry: "ghcr.io" + documentation: "https://github.com/KrishnaswamyLab/MAGIC#readme" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "scprep" + - "magic-impute" + - "scipy" + - "scikit-learn<1.2" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/magic/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/methods/magic" + executable: "target/executable/methods/magic/magic" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. 
As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/methods/magic/magic b/target/executable/methods/magic/magic new file mode 100755 index 0000000..3ac1259 --- /dev/null +++ b/target/executable/methods/magic/magic @@ -0,0 +1,1261 @@ +#!/usr/bin/env bash + +# magic 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="magic" +VIASH_META_FUNCTIONALITY_NAME="magic" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "magic 1.0.0" + echo "" + echo "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for" + echo "imputation and denoising of noisy or dropout-prone single cell RNA-sequencing" + echo "data. Given a normalised scRNA-seq expression matrix, it first calculates" + echo "Euclidean distances between each pair of cells in the dataset, which is then" + echo "augmented using a Gaussian kernel (function) and row-normalised to give a" + echo "normalised affinity matrix. A t-step markov process is then calculated, by" + echo "powering this affinity matrix t times. Finally, the powered affinity matrix is" + echo "right-multiplied by the normalised data, causing the final imputed values to" + echo "take the value of a per-gene average weighted by the affinities of cells. The" + echo "resultant imputed matrix is then rescaled, to more closely match the magnitude" + echo "of measurements in the normalised (input) matrix." + echo "" + echo "Arguments:" + echo " --input_train" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + echo "" + echo " --solver" + echo " type: string" + echo " default: exact" + echo " choices: [ exact, approximate ]" + echo " Which solver to use." 
+ echo "" + echo " --norm" + echo " type: string" + echo " default: log" + echo " choices: [ sqrt, log ]" + echo " Normalization method" + echo "" + echo " --decay" + echo " type: integer" + echo " default: 1" + echo " sets decay rate of kernel tails" + echo "" + echo " --t" + echo " type: integer" + echo " default: 3" + echo " power to which the diffusion operator is powered" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? 
# returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? 
+ [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_python:1.0.0 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scprep" "magic-impute" "scipy" "scikit-learn<1.2" "numpy<2" + +LABEL org.opencontainers.image.description="Companion container for running component methods magic" +LABEL org.opencontainers.image.created="2024-12-19T16:15:29Z" +LABEL org.opencontainers.image.source="https://github.com/KrishnaswamyLab/MAGIC" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "magic 1.0.0" + exit + ;; + --input_train) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_train. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_train=*) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train=*\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --solver) + [ -n "$VIASH_PAR_SOLVER" ] && ViashError Bad arguments for option \'--solver\': \'$VIASH_PAR_SOLVER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLVER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --solver. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --solver=*) + [ -n "$VIASH_PAR_SOLVER" ] && ViashError Bad arguments for option \'--solver=*\': \'$VIASH_PAR_SOLVER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLVER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --norm) + [ -n "$VIASH_PAR_NORM" ] && ViashError Bad arguments for option \'--norm\': \'$VIASH_PAR_NORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NORM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --norm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --norm=*) + [ -n "$VIASH_PAR_NORM" ] && ViashError Bad arguments for option \'--norm=*\': \'$VIASH_PAR_NORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NORM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --decay) + [ -n "$VIASH_PAR_DECAY" ] && ViashError Bad arguments for option \'--decay\': \'$VIASH_PAR_DECAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DECAY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --decay. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --decay=*) + [ -n "$VIASH_PAR_DECAY" ] && ViashError Bad arguments for option \'--decay=*\': \'$VIASH_PAR_DECAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DECAY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --t) + [ -n "$VIASH_PAR_T" ] && ViashError Bad arguments for option \'--t\': \'$VIASH_PAR_T\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_T="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --t. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --t=*) + [ -n "$VIASH_PAR_T" ] && ViashError Bad arguments for option \'--t=*\': \'$VIASH_PAR_T\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_T=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. 
&& exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/methods/magic:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 
1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then + ViashError '--input_train' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_SOLVER+x} ]; then + VIASH_PAR_SOLVER="exact" +fi +if [ -z ${VIASH_PAR_NORM+x} ]; then + VIASH_PAR_NORM="log" +fi +if [ -z ${VIASH_PAR_DECAY+x} ]; then + VIASH_PAR_DECAY="1" +fi +if [ -z ${VIASH_PAR_T+x} ]; then + VIASH_PAR_T="3" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_INPUT_TRAIN" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TRAIN' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_DECAY" ]]; then + if ! [[ "$VIASH_PAR_DECAY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--decay' has to be an integer. Use "--help" to get more information on the parameters. 
+ exit 1 + fi +fi +if [[ -n "$VIASH_PAR_T" ]]; then + if ! [[ "$VIASH_PAR_T" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--t' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_SOLVER" ]; then + VIASH_PAR_SOLVER_CHOICES=("exact;approximate") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_SOLVER_CHOICES[*]};" =~ ";$VIASH_PAR_SOLVER;" ]]; then + ViashError '--solver' specified value of \'$VIASH_PAR_SOLVER\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_NORM" ]; then + VIASH_PAR_NORM_CHOICES=("sqrt;log") + IFS=';' + set -f + if ! 
[[ ";${VIASH_PAR_NORM_CHOICES[*]};" =~ ";$VIASH_PAR_NORM;" ]]; then + ViashError '--norm' specified value of \'$VIASH_PAR_NORM\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TRAIN")" ) + VIASH_PAR_INPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TRAIN") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-magic-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import anndata as ad +import numpy as np +import scprep +from magic import MAGIC +import scipy + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! 
-z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'solver': $( if [ ! -z ${VIASH_PAR_SOLVER+x} ]; then echo "r'${VIASH_PAR_SOLVER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'norm': $( if [ ! -z ${VIASH_PAR_NORM+x} ]; then echo "r'${VIASH_PAR_NORM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'decay': $( if [ ! -z ${VIASH_PAR_DECAY+x} ]; then echo "int(r'${VIASH_PAR_DECAY//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 't': $( if [ ! -z ${VIASH_PAR_T+x} ]; then echo "int(r'${VIASH_PAR_T//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par["input_train"]) + +print("Set normalization method", flush=True) +if par["norm"] == "sqrt": + norm_fn = np.sqrt + denorm_fn = np.square +elif par["norm"] == "log": + norm_fn = np.log1p + denorm_fn = np.expm1 +else: + raise ValueError("Unknown normalization method: " + par["norm"] + ".") + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"] + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Normalize data", flush=True) +X, libsize = scprep.normalize.library_size_normalize( + X, + rescale=1, + return_library_size=True +) +X = scprep.utils.matrix_transform(X, norm_fn) + +print("Run MAGIC", flush=True) +magic = MAGIC( + solver=par["solver"], + decay=par["decay"], + t=par["t"], + verbose=False, +) +X = magic.fit_transform(X, genes="all_genes") + +print("Denormalizing data", flush=True) +X = scprep.utils.matrix_transform(X, denorm_fn) +X = scprep.utils.matrix_vector_elementwise_multiply(X, libsize, axis=0) + +print("Create output AnnData", flush=True) +output.layers["denoised"] = X + +print("Write Data", flush=True) +output.write_h5ad(par["output"], compression="gzip") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_PAR_INPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TRAIN") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/methods/scprint/.config.vsh.yaml b/target/executable/methods/scprint/.config.vsh.yaml new file mode 100644 index 0000000..939b833 --- /dev/null +++ b/target/executable/methods/scprint/.config.vsh.yaml @@ -0,0 +1,299 @@ +name: "scprint" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." 
+ required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--model_name" + description: "Which model to use. Not used if --model is provided." + info: null + default: + - "large" + required: false + choices: + - "large" + - "medium" + - "small" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model" + description: "Path to the scPRINT model." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "scPRINT" +summary: "scPRINT is a large transformer model built for the inference of gene networks" +description: "scPRINT is a large transformer model built for the inference of gene\ + \ networks\n(connections between genes explaining the cell's expression profile)\ + \ from\nscRNAseq data.\n\nIt uses novel encoding and decoding of the cell expression\ + \ profile and new\npre-training methodologies to learn a cell model.\n\nscPRINT\ + \ can be used to perform the following analyses:\n\n- expression denoising: increase\ + \ the resolution of your scRNAseq data\n- cell embedding: generate a low-dimensional\ + \ representation of your dataset\n- label prediction: predict the cell type, disease,\ + \ sequencer, sex, and\n ethnicity of your cells\n- gene network inference: generate\ + \ a gene network from any cell or cell\n cluster in your scRNAseq dataset\n" +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +info: + preferred_normalization: "counts" + variants: + scprint_large: + model_name: "large" + scprint_medium: + model_name: "medium" + scprint_small: + model_name: "small" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. 
technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1101/2024.07.29.605556" +links: + repository: "https://github.com/cantinilab/scPRINT" + docker_registry: "ghcr.io" + documentation: "https://cantinilab.github.io/scPRINT/" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_pytorch_nvidia:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "huggingface_hub" + - "scprint" + upgrade: true + - type: "docker" + run: + - "lamin init --storage ./main --name main --schema bionty" + - type: "python" + user: false + script: + - "import bionty as bt; bt.core.sync_all_sources_to_latest()" + upgrade: true + - type: "docker" + run: + - "lamin load anonymous/main" + - type: "python" + user: false + script: + - "from scdataloader.utils import populate_my_ontology; populate_my_ontology()" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/scprint/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/methods/scprint" + executable: "target/executable/methods/scprint/scprint" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. 
The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/methods/scprint/scprint b/target/executable/methods/scprint/scprint new file mode 100755 index 0000000..415e5df --- /dev/null +++ b/target/executable/methods/scprint/scprint @@ -0,0 +1,1259 @@ +#!/usr/bin/env bash + +# scprint 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
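+#
+# What this generated wrapper does at runtime:
+#   1. parses the component arguments (--input_train, --output, --model_name,
+#      --model) plus viash runtime flags such as ---setup, ---cpus and ---memory;
+#   2. ensures the Docker image
+#      ghcr.io/openproblems-bio/task_denoising/methods/scprint:1.0.0 is available
+#      (default strategy: ifneedbepullelsecachedbuild);
+#   3. auto-mounts all file arguments into the container and executes the
+#      embedded Python script, which loads an scPRINT checkpoint and writes the
+#      denoised AnnData to --output.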
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scprint" +VIASH_META_FUNCTIONALITY_NAME="scprint" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scprint 1.0.0" + echo "" + echo "scPRINT is a large transformer model built for the inference of gene networks" + echo "(connections between genes explaining the cell's expression profile) from" + echo "scRNAseq data." + echo "" + echo "It uses novel encoding and decoding of the cell expression profile and new" + echo "pre-training methodologies to learn a cell model." + echo "" + echo "scPRINT can be used to perform the following analyses:" + echo "" + echo "- expression denoising: increase the resolution of your scRNAseq data" + echo "- cell embedding: generate a low-dimensional representation of your dataset" + echo "- label prediction: predict the cell type, disease, sequencer, sex, and" + echo " ethnicity of your cells" + echo "- gene network inference: generate a gene network from any cell or cell" + echo " cluster in your scRNAseq dataset" + echo "" + echo "Arguments:" + echo " --input_train" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + echo "" + echo " --model_name" + echo " type: string" + echo " default: large" + echo " choices: [ large, medium, small ]" + echo " Which model to use. Not used if --model is provided." + echo "" + echo " --model" + echo " type: file, file must exist" + echo " Path to the scPRINT model." 
+} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." 
+ fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
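+  # restore errexit ('set -e') if it was enabled before the temporary 'set +e' above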
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_pytorch_nvidia:1.0.0 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "huggingface_hub" "scprint" + +RUN lamin init --storage ./main --name main --schema bionty +RUN pip install --upgrade pip && \ + python -c 'import bionty as bt; bt.core.sync_all_sources_to_latest()' + +RUN lamin load anonymous/main +RUN pip install --upgrade pip && \ + python -c 'from scdataloader.utils import populate_my_ontology; populate_my_ontology()' + +LABEL org.opencontainers.image.description="Companion container for running component methods scprint" +LABEL org.opencontainers.image.created="2024-12-19T16:15:29Z" +LABEL org.opencontainers.image.source="https://github.com/cantinilab/scPRINT" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scprint 1.0.0" + exit + ;; + --input_train) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_train. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_train=*) + [ -n "$VIASH_PAR_INPUT_TRAIN" ] && ViashError Bad arguments for option \'--input_train=*\': \'$VIASH_PAR_INPUT_TRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TRAIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model_name) + [ -n "$VIASH_PAR_MODEL_NAME" ] && ViashError Bad arguments for option \'--model_name\': \'$VIASH_PAR_MODEL_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_name=*) + [ -n "$VIASH_PAR_MODEL_NAME" ] && ViashError Bad arguments for option \'--model_name=*\': \'$VIASH_PAR_MODEL_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. 
&& exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/methods/scprint:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! 
-z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then + ViashError '--input_train' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODEL_NAME+x} ]; then + VIASH_PAR_MODEL_NAME="large" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ] && [ ! -e "$VIASH_PAR_INPUT_TRAIN" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TRAIN' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL" ] && [ ! -e "$VIASH_PAR_MODEL" ]; then + ViashError "Input file '$VIASH_PAR_MODEL' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. 
Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_MODEL_NAME" ]; then + VIASH_PAR_MODEL_NAME_CHOICES=("large;medium;small") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_MODEL_NAME_CHOICES[*]};" =~ ";$VIASH_PAR_MODEL_NAME;" ]]; then + ViashError '--model_name' specified value of \'$VIASH_PAR_MODEL_NAME\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TRAIN")" ) + VIASH_PAR_INPUT_TRAIN=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TRAIN") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL")" ) + VIASH_PAR_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! 
-z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scprint-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import os + +import anndata as ad +import scprint +import torch +from huggingface_hub import hf_hub_download +from scdataloader import Preprocessor +from scprint import scPrint +from scprint.tasks import Denoiser +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model_name': $( if [ ! -z ${VIASH_PAR_MODEL_NAME+x} ]; then echo "r'${VIASH_PAR_MODEL_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! 
-z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print(f"====== scPRINT version {scprint.__version__} ======", flush=True) + +print("\\n>>> Reading input data...", flush=True) +input = ad.read_h5ad(par["input_train"]) +print(input) + +print("\\n>>> Preprocessing data...", flush=True) +adata = ad.AnnData( + X=input.layers["counts"] +) +adata.obs_names = input.obs_names +adata.var_names = input.var_names +if input.uns["dataset_organism"] == "homo_sapiens": + adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606" +elif input.uns["dataset_organism"] == "mus_musculus": + adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090" +else: + raise ValueError( + f"scPRINT requires human or mouse data, not '{input.uns['dataset_organism']}'" + ) + +preprocessor = Preprocessor( + # Lower this threshold for test datasets + min_valid_genes_id=1000 if input.n_vars < 2000 else 10000, + # Turn off cell filtering to return results for all cells + filter_cell_by_counts=False, + min_nnz_genes=False, + do_postp=False, + # Skip ontology checks + skip_validate=True, +) +adata = preprocessor(adata) +print(adata) + +model_checkpoint_file = par["model"] +if model_checkpoint_file is None: + print(f"\\n>>> Downloading '{par['model_name']}' model...", flush=True) + model_checkpoint_file = hf_hub_download( + repo_id="jkobject/scPRINT", filename=f"{par['model_name']}.ckpt" + ) +print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True) +model = scPrint.load_from_checkpoint( + model_checkpoint_file, + transformer="normal", # Don't use this for GPUs with flashattention + precpt_gene_emb=None, +) + +print("\\n>>> Denoising data...", flush=True) +if torch.cuda.is_available(): + print("CUDA is available, using GPU", flush=True) + precision = "16-mixed" + dtype = torch.float16 +else: + print("CUDA is not available, using CPU", flush=True) + precision = "32" + dtype = torch.float32 +n_cores_available = len(os.sched_getaffinity(0)) +print(f"Using {n_cores_available} worker cores") +denoiser = Denoiser( + num_workers=n_cores_available, + precision=precision, + max_cells=adata.n_obs, + doplot=False, + dtype=dtype, +) +_, idxs, genes, expr_pred = denoiser(model, adata) +print(f"Predicted expression dimensions: {expr_pred.shape}") + +print("\\n>>> Applying denoising...", flush=True) +adata.X = adata.X.tolil() +idxs = idxs if idxs is not None else range(adata.shape[0]) +for i, idx in enumerate(idxs): + adata.X[idx, adata.var.index.get_indexer(genes)] = expr_pred[i] +adata.X = adata.X.tocsr() +print(adata) + +print("\\n>>> Storing output...", flush=True) +output = ad.AnnData( + layers={ + "denoised": adata.X[:, adata.var.index.get_indexer(input.var_names)], + }, + obs=input.obs[[]], + var=input.var[[]], + uns={ + "dataset_id": input.uns["dataset_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\\n>>> Writing output AnnData to file...", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\\n>>> Done!", flush=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TRAIN" ]; then + VIASH_PAR_INPUT_TRAIN=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TRAIN") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! 
-z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metrics/mse/.config.vsh.yaml b/target/executable/metrics/mse/.config.vsh.yaml new file mode 100644 index 0000000..983a0c5 --- /dev/null +++ b/target/executable/metrics/mse/.config.vsh.yaml @@ -0,0 +1,317 @@ +name: "mse" +namespace: "metrics" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_prediction" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Score" + summary: "File indicating the score of a metric." 
+ info: + format: + type: "h5ad" + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + - type: "string" + name: "metric_ids" + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: "double" + name: "metric_values" + description: "The metric values obtained for the given prediction. Must\ + \ be of same length as 'metric_ids'." + multiple: true + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + metrics: + - name: "mse" + label: "Mean-squared error" + summary: "The mean squared error between the denoised counts and the true counts." + description: "The mean squared error between the denoised counts of the training\ + \ dataset and the true counts of the test dataset after reweighing by the train/test\ + \ ratio" + references: + doi: "10.1101/786269" + v1: + path: "openproblems/tasks/denoising/metrics/mse.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + maximize: false + min: 0 + max: "+.inf" + type: "metric" + type_info: + label: "Metric" + summary: "A metric." 
+ description: "A metric for evaluating denoised datasets.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pypi: + - "scikit-learn" + - "scprep" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/metrics/mse/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/metrics/mse" + executable: "target/executable/metrics/mse/mse" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. 
In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/metrics/mse/mse b/target/executable/metrics/mse/mse new file mode 100755 index 0000000..d18696b --- /dev/null +++ b/target/executable/metrics/mse/mse @@ -0,0 +1,1137 @@ +#!/usr/bin/env bash + +# mse 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
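+#
+# What this generated wrapper does at runtime:
+#   1. parses the component arguments (--input_test, --input_prediction,
+#      --output) plus the viash runtime flags;
+#   2. ensures the component's Docker image (built on openproblems/base_python:1.0.0)
+#      is available, by default pulling it or else building it from the embedded
+#      Dockerfile;
+#   3. auto-mounts the file arguments into the container and, like the scprint
+#      wrapper, runs the component's Python script, which writes the score h5ad
+#      to --output.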
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="mse" +VIASH_META_FUNCTIONALITY_NAME="mse" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "mse 1.0.0" + echo "" + echo "Arguments:" + echo " --input_test" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + echo "" + echo " --input_prediction" + echo " type: file, required parameter, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. 
+# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_python:1.0.0 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scikit-learn" "scprep" "numpy<2" + +LABEL org.opencontainers.image.description="Companion container for running component metrics mse" +LABEL org.opencontainers.image.created="2024-12-19T16:15:28Z" +LABEL org.opencontainers.image.source="https://github.com/openproblems-bio/task_denoising" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "mse 1.0.0" + exit + ;; + --input_test) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_test. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_test=*) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test=*\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_prediction) + [ -n "$VIASH_PAR_INPUT_PREDICTION" ] && ViashError Bad arguments for option \'--input_prediction\': \'$VIASH_PAR_INPUT_PREDICTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_PREDICTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_prediction. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --input_prediction=*) + [ -n "$VIASH_PAR_INPUT_PREDICTION" ] && ViashError Bad arguments for option \'--input_prediction=*\': \'$VIASH_PAR_INPUT_PREDICTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_PREDICTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/metrics/mse:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TEST+x} ]; then + ViashError '--input_test' is a required argument. Use "--help" to get more information on the parameters. 
+ exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_PREDICTION+x} ]; then + ViashError '--input_prediction' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TEST" ] && [ ! -e "$VIASH_PAR_INPUT_TEST" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TEST' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_PREDICTION" ] && [ ! -e "$VIASH_PAR_INPUT_PREDICTION" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_PREDICTION' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! 
[[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TEST")" ) + VIASH_PAR_INPUT_TEST=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TEST") +fi +if [ ! -z "$VIASH_PAR_INPUT_PREDICTION" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_PREDICTION")" ) + VIASH_PAR_INPUT_PREDICTION=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_PREDICTION") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! 
-z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-mse-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import anndata as ad +import scanpy as sc +import sklearn.metrics +import scprep + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_prediction': $( if [ ! -z ${VIASH_PAR_INPUT_PREDICTION+x} ]; then echo "r'${VIASH_PAR_INPUT_PREDICTION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! 
-z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load data", flush=True) +input_denoised = ad.read_h5ad(par['input_prediction']) +input_test = ad.read_h5ad(par['input_test']) + +test_data = ad.AnnData(X=input_test.layers["counts"]) +denoised_data = ad.AnnData(X=input_denoised.layers["denoised"]) + +print("Normalize data", flush=True) + +# scaling and transformation +target_sum = 10000 + +sc.pp.normalize_total(test_data, target_sum=target_sum) +sc.pp.log1p(test_data) + +sc.pp.normalize_total(denoised_data, target_sum=target_sum) +sc.pp.log1p(denoised_data) + +print("Compute mse value", flush=True) +error = sklearn.metrics.mean_squared_error( + scprep.utils.toarray(test_data.X), scprep.utils.toarray(denoised_data.X) +) + +print("Store mse value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] = input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_PAR_INPUT_TEST=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TEST") + fi + if [ ! -z "$VIASH_PAR_INPUT_PREDICTION" ]; then + VIASH_PAR_INPUT_PREDICTION=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_PREDICTION") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metrics/poisson/.config.vsh.yaml b/target/executable/metrics/poisson/.config.vsh.yaml new file mode 100644 index 0000000..5e62256 --- /dev/null +++ b/target/executable/metrics/poisson/.config.vsh.yaml @@ -0,0 +1,316 @@ +name: "poisson" +namespace: "metrics" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." 
+ required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_prediction" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Score" + summary: "File indicating the score of a metric." + info: + format: + type: "h5ad" + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + - type: "string" + name: "metric_ids" + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: "double" + name: "metric_values" + description: "The metric values obtained for the given prediction. Must\ + \ be of same length as 'metric_ids'." + multiple: true + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + metrics: + - name: "poisson" + label: "Poisson Loss" + summary: "The Poisson log likelihood of the true counts observed in the distribution\ + \ of denoised counts" + description: "The Poisson log likelihood of observing the true counts of the test\ + \ dataset given the distribution given in the denoised dataset." + references: + doi: "10.1101/786269" + v1: + path: "openproblems/tasks/denoising/metrics/poisson.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + maximize: false + min: 0 + max: "+.inf" + type: "metric" + type_info: + label: "Metric" + summary: "A metric." 
+ description: "A metric for evaluating denoised datasets.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pypi: + - "scprep" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/metrics/poisson/config.vsh.yaml" + runner: "executable" + engine: "docker" + output: "target/executable/metrics/poisson" + executable: "target/executable/metrics/poisson/poisson" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. 
In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/executable/metrics/poisson/poisson b/target/executable/metrics/poisson/poisson new file mode 100755 index 0000000..2f17da9 --- /dev/null +++ b/target/executable/metrics/poisson/poisson @@ -0,0 +1,1133 @@ +#!/usr/bin/env bash + +# poisson 1.0.0 +# +# This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. 
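+#
+# Quick usage sketch (the example input paths are the ones shown in this
+# component's --help text; "output/score.h5ad" is just a placeholder):
+#
+#   target/executable/metrics/poisson/poisson \
+#     --input_test resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad \
+#     --input_prediction resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad \
+#     --output output/score.h5ad
+#
+# Per the config above, this metric reports the Poisson loss of the observed
+# test counts under the denoised values (label "Poisson Loss", maximize: false,
+# so lower is better). As a rough reminder of the standard Poisson negative
+# log-likelihood, up to terms constant in lambda:
+#   mean(lambda - y * log(lambda))
+# The exact parameterisation used here is defined in the component's script.py.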
+ +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. 
+function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="poisson" +VIASH_META_FUNCTIONALITY_NAME="poisson" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "poisson 1.0.0" + echo "" + echo "Arguments:" + echo " --input_test" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + echo "" + echo " --input_prediction" + echo " type: file, required parameter, file must exist" + echo " example:" + echo "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" +} + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. 
+# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? 
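+  # restore the caller's errexit setting; if the pull failed, fall back to
+  # building the image locally with ViashDockerBuild, forwarding the image id
+  # and any extra build arguments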
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." 
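+  # any other setup strategy value is rejected below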
+ else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? 
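+  # restore errexit; if the build failed, show the captured build log (unless it
+  # was already streamed to the terminal) and abort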
+ [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM openproblems/base_python:1.0.0 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scprep" "numpy<2" + +LABEL org.opencontainers.image.description="Companion container for running component metrics poisson" +LABEL org.opencontainers.image.created="2024-12-19T16:15:28Z" +LABEL org.opencontainers.image.source="https://github.com/openproblems-bio/task_denoising" +LABEL org.opencontainers.image.revision="252731bc7276eb8a6a3398dc4bea026ae70eca80" +LABEL org.opencontainers.image.version="1.0.0" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) 
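+        # '..' drops the last path component collected so far, if any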
+ len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "poisson 1.0.0" + exit + ;; + --input_test) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_test. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_test=*) + [ -n "$VIASH_PAR_INPUT_TEST" ] && ViashError Bad arguments for option \'--input_test=*\': \'$VIASH_PAR_INPUT_TEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_TEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_prediction) + [ -n "$VIASH_PAR_INPUT_PREDICTION" ] && ViashError Bad arguments for option \'--input_prediction\': \'$VIASH_PAR_INPUT_PREDICTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_PREDICTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_prediction. Use "--help" to get more information on the parameters. 
&& exit 1 + shift 2 + ;; + --input_prediction=*) + [ -n "$VIASH_PAR_INPUT_PREDICTION" ] && ViashError Bad arguments for option \'--input_prediction=*\': \'$VIASH_PAR_INPUT_PREDICTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_PREDICTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker." 
+ exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='ghcr.io/openproblems-bio/task_denoising/metrics/poisson:1.0.0' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_TEST+x} ]; then + ViashError '--input_test' is a required argument. Use "--help" to get more information on the parameters. 
+ exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_PREDICTION+x} ]; then + ViashError '--input_prediction' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_TEST" ] && [ ! -e "$VIASH_PAR_INPUT_TEST" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_TEST' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_PREDICTION" ] && [ ! -e "$VIASH_PAR_INPUT_PREDICTION" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_PREDICTION' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! 
[[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_TEST")" ) + VIASH_PAR_INPUT_TEST=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_TEST") +fi +if [ ! -z "$VIASH_PAR_INPUT_PREDICTION" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_PREDICTION")" ) + VIASH_PAR_INPUT_PREDICTION=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_PREDICTION") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! 
-z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-poisson-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import anndata as ad +import scprep +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_prediction': $( if [ ! -z ${VIASH_PAR_INPUT_PREDICTION+x} ]; then echo "r'${VIASH_PAR_INPUT_PREDICTION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! 
-z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load Data", flush=True) +input_denoised = ad.read_h5ad(par['input_prediction']) +input_test = ad.read_h5ad(par['input_test']) + +test_data = scprep.utils.toarray(input_test.layers["counts"]) +denoised_data = scprep.utils.toarray(input_denoised.layers["denoised"]) + +print("Compute metric value", flush=True) +# scaling +initial_sum = input_test.uns["train_sum"] +target_sum = test_data.sum() +denoised_data = denoised_data * target_sum / initial_sum + +# from molecular_cross_validation.mcv_sweep import poisson_nll_loss +# copied from: https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/mcv_sweep.py +def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float: + return (y_pred - y_true * np.log(y_pred + 1e-6)).mean() + +error = poisson_nll_loss(test_data, denoised_data) + +print("Store poisson value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] = input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_TEST" ]; then + VIASH_PAR_INPUT_TEST=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_TEST") + fi + if [ ! -z "$VIASH_PAR_INPUT_PREDICTION" ]; then + VIASH_PAR_INPUT_PREDICTION=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_PREDICTION") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." 
+ exit 1 +fi + + +exit 0 diff --git a/target/nextflow/control_methods/no_denoising/.config.vsh.yaml b/target/nextflow/control_methods/no_denoising/.config.vsh.yaml new file mode 100644 index 0000000..f311c58 --- /dev/null +++ b/target/nextflow/control_methods/no_denoising/.config.vsh.yaml @@ -0,0 +1,300 @@ +name: "no_denoising" +namespace: "control_methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "No Denoising" +summary: "negative control by copying train counts" +description: "This method serves as a negative control, where the denoised data is\ + \ a copy of the unaltered training data. 
This represents the scoring threshold if\ + \ denoising was not performed on the data." +test_resources: +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/baseline.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "control_method" + type_info: + label: "Control Method" + summary: "A control method." + description: "These components have the same interface as the regular methods\n\ + but also receive the solution object as input. It serves as a\nstarting point\ + \ to test the relative accuracy of new methods in\nthe task, and also as a quality\ + \ control for the metrics defined\nin the task.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + entrypoint: [] + cmd: null +build_info: + config: "src/control_methods/no_denoising/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/control_methods/no_denoising" + executable: "target/nextflow/control_methods/no_denoising/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). 
In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/control_methods/no_denoising/main.nf b/target/nextflow/control_methods/no_denoising/main.nf new file mode 100644 index 0000000..e10b279 --- /dev/null +++ b/target/nextflow/control_methods/no_denoising/main.nf @@ -0,0 +1,3732 @@ +// no_denoising 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. 
The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? 
null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if 
(items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." 
+ } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. 
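+ *
+ * For example, passing `--param_list params.yaml`, where `params.yaml` holds a
+ * list of maps that each contain a unique `id`, produces one `[id, parameters]`
+ * entry (and hence one channel event) per element of that list.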
+ * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? 
+ params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? 
args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. 
+ * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. 
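+ *
+ * Illustrative sketch (channel and file names are hypothetical):
+ *   safeJoin(out_ch, input_ch, "my_component")
+ *   // input_ch event: ["id1", [input: file("in.h5ad")]]
+ *   // out_ch event:   ["id1", [output: file("out.h5ad")]]
+ *   // joined event:   ["id1", [output: file("out.h5ad")], [input: file("in.h5ad")]]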
+ */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. 
+ | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? 
+ "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. 
+ * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. 
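+ *
+ * Illustrative example: iterateMap([a: [1, 2], b: 3], { it * 2 }) returns [a: [2, 4], b: 6].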
+ */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "no_denoising", + "namespace" : "control_methods", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_test", + "label" : "Test data", + "summary" : "The subset of molecules used for the test dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + }, + { + "name" : "train_sum", + "type" : "integer", + "description" : "The total number of counts in the training dataset.", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", 
+ "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "label" : "No Denoising", + "summary" : "negative control by copying train counts", + "description" : "This method serves as a negative control, where the denoised data is a copy of the unaltered training data. This represents the scoring threshold if denoising was not performed on the data.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" + }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "v1" : { + "path" : "openproblems/tasks/denoising/methods/baseline.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "preferred_normalization" : "counts", + "type" : "control_method", + "type_info" : { + "label" : "Control Method", + "summary" : "A control method.", + "description" : "These components have the same interface as the regular methods\nbut also receive the solution object as input. 
It serves as a\nstarting point to test the relative accuracy of new methods in\nthe task, and also as a quality control for the metrics defined\nin the task.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/" + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/control_methods/no_denoising/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/control_methods/no_denoising", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. 
As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! 
-z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_train.layers['counts'] + +input_train.uns["method_id"] = meta['name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. 
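+ *
+ * Illustrative example only (names taken from this component's config, values assumed):
+ * an input event could look like
+ *   ["run", [input_train: file("train.h5ad"), input_test: file("test.h5ad"), output: "denoised.h5ad"]]
+ * and the corresponding output event like
+ *   ["run", [output: file("denoised.h5ad")]]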
+ */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
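+    // illustrative only: a workflow calling this module twice might disambiguate as
+    //   no_denoising.run(key: "no_denoising_rep1")
+    //   no_denoising.run(key: "no_denoising_rep2")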
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/control_methods/no_denoising", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/control_methods/no_denoising/nextflow.config b/target/nextflow/control_methods/no_denoising/nextflow.config new file mode 100644 index 0000000..5f1a7eb --- /dev/null +++ b/target/nextflow/control_methods/no_denoising/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'control_methods/no_denoising' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = 'This method serves as a negative control, where the denoised data is a copy of the unaltered training data. This represents the scoring threshold if denoising was not performed on the data.' 
+} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/control_methods/perfect_denoising/.config.vsh.yaml b/target/nextflow/control_methods/perfect_denoising/.config.vsh.yaml new file mode 100644 index 0000000..a23e62b --- /dev/null +++ b/target/nextflow/control_methods/perfect_denoising/.config.vsh.yaml @@ -0,0 +1,300 @@ +name: "perfect_denoising" +namespace: "control_methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." 
+ required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "Perfect Denoising" +summary: "Positive control by copying the test counts" +description: "This method serves as a positive control, where the test data is copied\ + \ 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised\ + \ as it will be compared to the test data in the metrics." +test_resources: +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/baseline.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "control_method" + type_info: + label: "Control Method" + summary: "A control method." + description: "These components have the same interface as the regular methods\n\ + but also receive the solution object as input. 
It serves as a\nstarting point\ + \ to test the relative accuracy of new methods in\nthe task, and also as a quality\ + \ control for the metrics defined\nin the task.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + entrypoint: [] + cmd: null +build_info: + config: "src/control_methods/perfect_denoising/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/control_methods/perfect_denoising" + executable: "target/nextflow/control_methods/perfect_denoising/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. 
As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/control_methods/perfect_denoising/main.nf b/target/nextflow/control_methods/perfect_denoising/main.nf new file mode 100644 index 0000000..c0e19f6 --- /dev/null +++ b/target/nextflow/control_methods/perfect_denoising/main.nf @@ -0,0 +1,3733 @@ +// perfect_denoising 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? 
" id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? 
null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are 
unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. 
These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
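 *
 *   Illustrative sketch only (the state keys below are hypothetical, not part
 *   of this module): fromState: ["input_train"] forwards that state key to the
 *   component unchanged, fromState: [input: "train_file"] passes the state
 *   entry "train_file" to the component as its "input" argument, and
 *   fromState: { id, state, comp -> [input: state.train_file] } builds the
 *   argument map with a closure.
 *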
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
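 * (Illustrative sketch, not part of the generated docstring; ids and file
 * names are hypothetical: given a source event
 * ["sample1", [input: file("train.h5ad")]] and a target event
 * ["sample1", [output: file("denoised.h5ad")]], the join emits
 * ["sample1", [output: file("denoised.h5ad")], [input: file("train.h5ad")]],
 * i.e. the target event with the source state appended.)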
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
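// Illustrative walk-through (comment only, values assumed): a tuple such as
//   ["sample_1", "sample_1", [output: file("denoised.h5ad")]]
// loses its leading join_id in the map below, so downstream operators again receive
//   ["sample_1", [output: file("denoised.h5ad")]]
// which matches the [id, state] shape this module originally consumed.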
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "perfect_denoising", + "namespace" : "control_methods", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_test", + "label" : "Test data", + "summary" : "The subset of molecules used for the test dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + }, + { + "name" : "train_sum", + "type" : "integer", + "description" : "The total number of counts in the training dataset.", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised 
data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "label" : "Perfect Denoising", + "summary" : "Positive control by copying the test counts", + "description" : "This method serves as a positive control, where the test data is copied 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised as it will be compared to the test data in the metrics.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" + }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "v1" : { + "path" : "openproblems/tasks/denoising/methods/baseline.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "preferred_normalization" : "counts", + "type" : "control_method", + "type_info" : { + "label" : "Control Method", + "summary" : "A control method.", + "description" : "These components have the same interface as the regular methods\nbut also receive the solution object as input. 
It serves as a\nstarting point to test the relative accuracy of new methods in\nthe task, and also as a quality control for the metrics defined\nin the task.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/" + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/control_methods/perfect_denoising/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/control_methods/perfect_denoising", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. 
As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! 
-z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_test.layers['counts'] + +input_train.uns["method_id"] = meta['name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. 
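 *
 * Illustrative example (file names assumed): for this perfect_denoising module, an
 * input element
 *   ["sample_1", [input_train: file("train.h5ad"), input_test: file("test.h5ad"), output: "denoised.h5ad"]]
 * is expected to yield an output element
 *   ["sample_1", [output: file("denoised.h5ad")]]
 * once the generated process has completed.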
+ */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/control_methods/perfect_denoising", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
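+ //
+ // A combined call might look like this (illustrative sketch only; the state keys
+ // `train`, `test` and `denoised` are hypothetical):
+ //   perfect_denoising.run(
+ //     fromState: [ input_train: "train", input_test: "test" ],
+ //     toState: { id, output, state -> state + [ denoised: output.output ] }
+ //   )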
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/control_methods/perfect_denoising/nextflow.config b/target/nextflow/control_methods/perfect_denoising/nextflow.config new file mode 100644 index 0000000..9d922b0 --- /dev/null +++ b/target/nextflow/control_methods/perfect_denoising/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'control_methods/perfect_denoising' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = 'This method serves as a positive control, where the test data is copied 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised as it will be compared to the test data in the metrics.' 
+} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/data_processors/process_dataset/.config.vsh.yaml b/target/nextflow/data_processors/process_dataset/.config.vsh.yaml new file mode 100644 index 0000000..9dbeebf --- /dev/null +++ b/target/nextflow/data_processors/process_dataset/.config.vsh.yaml @@ -0,0 +1,364 @@ +name: "process_dataset" +namespace: "data_processors" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + label: "Common Dataset" + summary: "A subset of the common dataset." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + obs: + - type: "string" + name: "batch" + description: "Batch information" + required: false + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." 
+ required: false + example: + - "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--method" + description: "The process method to assign train/test." + info: null + default: + - "mcv" + required: false + choices: + - "mcv" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--train_frac" + description: "The fraction the molecules need to be split to train dataset" + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "A seed for the subsampling." + info: null + example: + - 123 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_obs_limit" + description: "The maximum number of cells the dataset may have before subsampling\ + \ according to `obs.batch`." 
+ info: null + default: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +description: "Split data using molecular cross-validation.\n\nSplits molecules into\ + \ two (potentially overlapping) groups using a fraction ratio.\nThese are output\ + \ as two separate AnnData objects.\n" +test_resources: +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "file" + path: "resources_test/common/cxg_immune_cell_atlas" + dest: "resources_test/common/cxg_immune_cell_atlas" +info: + type: "process_dataset" + type_info: + label: "Data processor" + summary: "A denoising dataset processor." + description: "A component for processing a Common Dataset into a task-specific\ + \ dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "midcpu" + - "midtime" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "numpy" + - "scipy" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/data_processors/process_dataset/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/data_processors/process_dataset" + executable: "target/nextflow/data_processors/process_dataset/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). 
In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/data_processors/process_dataset/helper.py b/target/nextflow/data_processors/process_dataset/helper.py new file mode 100644 index 0000000..2044ed4 --- /dev/null +++ b/target/nextflow/data_processors/process_dataset/helper.py @@ -0,0 +1,55 @@ +# MIT License + +# Copyright (c) 2019 Chan Zuckerberg Biohub + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Copied from https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/util.py + + +from typing import Tuple + +import numpy as np + +def split_molecules( + umis: np.ndarray, + data_split: float, + overlap_factor: float = 0.0, + random_state: np.random.RandomState = None, +) -> Tuple[np.ndarray, np.ndarray]: + """Splits molecules into two (potentially overlapping) groups. + :param umis: Array of molecules to split + :param data_split: Proportion of molecules to assign to the first group + :param overlap_factor: Overlap correction factor, if desired + :param random_state: For reproducible sampling + :return: umis_X and umis_Y, representing ``split`` and ``~(1 - split)`` counts + sampled from the input array + """ + if random_state is None: + random_state = np.random.RandomState() + + umis_X_disjoint = random_state.binomial(umis, data_split - overlap_factor) + umis_Y_disjoint = random_state.binomial( + umis - umis_X_disjoint, (1 - data_split) / (1 - data_split + overlap_factor) + ) + overlap_factor = umis - umis_X_disjoint - umis_Y_disjoint + umis_X = umis_X_disjoint + overlap_factor + umis_Y = umis_Y_disjoint + overlap_factor + + return umis_X, umis_Y \ No newline at end of file diff --git a/target/nextflow/data_processors/process_dataset/main.nf b/target/nextflow/data_processors/process_dataset/main.nf new file mode 100644 index 0000000..865acd3 --- /dev/null +++ b/target/nextflow/data_processors/process_dataset/main.nf @@ -0,0 +1,3897 @@ +// process_dataset 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. 
Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? 
null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. 
Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. 
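+  // For example (illustrative paths): with `--param_list /data/params.yaml`, an entry
+  // `input: dataset.h5ad` is resolved to `/data/dataset.h5ad`, while absolute paths and
+  // URIs containing '://' (such as s3:// locations) are left untouched.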
+ if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
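+ *   e.g. (illustrative; the keys are hypothetical): `fromState: ["input_train", "input_test"]`,
+ *   `fromState: [input: "train"]`, or `fromState: { id, state, comp -> [input: state.train] }`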
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf'
+
+def readConfig(file) {
+  def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml"))
+  processConfig(config)
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf'
+/**
+ * Resolve a path relative to the current file.
+ *
+ * @param str The path to resolve, as a String.
+ * @param parentPath The path to resolve relative to, as a Path.
+ *
+ * @return The path that may have been resolved, as a Path.
+ */
+def _resolveSiblingIfNotAbsolute(str, parentPath) {
+  if (str !instanceof String) {
+    return str
+  }
+  if (!_stringIsAbsolutePath(str)) {
+    return parentPath.resolveSibling(str)
+  } else {
+    return file(str, hidden: true)
+  }
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf'
+/**
+ * Check whether a path as a string is absolute.
+ *
+ * In the past, we tried using `file(., relative: true).isAbsolute()`,
+ * but the 'relative' option was added in 22.10.0.
+ *
+ * @param path The path to check, as a String.
+ *
+ * @return Whether the path is absolute, as a boolean.
+ */
+def _stringIsAbsolutePath(path) {
+  def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/
+
+  assert path instanceof String
+  return _resolve_URL_PROTOCOL.matcher(path).matches()
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf'
+class CustomTraceObserver implements nextflow.trace.TraceObserver {
+  List traces
+
+  CustomTraceObserver(List traces) {
+    this.traces = traces
+  }
+
+  @Override
+  void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) {
+    def trace2 = trace.store.clone()
+    trace2.script = null
+    traces.add(trace2)
+  }
+
+  @Override
+  void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) {
+    def trace2 = trace.store.clone()
+    trace2.script = null
+    traces.add(trace2)
+  }
+}
+
+def collectTraces() {
+  def traces = Collections.synchronizedList([])
+
+  // add custom trace observer which stores traces in the traces object
+  session.observers.add(new CustomTraceObserver(traces))
+
+  traces
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf'
+/**
+ * Performs a deep clone of the given object.
+ * @param x an object
+ */
+def deepClone(x) {
+  iterateMap(x, {it instanceof Cloneable ? it.clone() : it})
+}
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf'
+def getPublishDir() {
+  return params.containsKey("publish_dir") ? params.publish_dir :
+    params.containsKey("publishDir") ? params.publishDir :
+    null
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf'
+
+// Recurse upwards until we find a '.build.yaml' file
+def _findBuildYamlFile(pathPossiblySymlink) {
+  def path = pathPossiblySymlink.toRealPath()
+  def child = path.resolve(".build.yaml")
+  if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) {
+    return child
+  } else {
+    def parent = path.getParent()
+    if (parent == null) {
+      return null
+    } else {
+      return _findBuildYamlFile(parent)
+    }
+  }
+}
+
+// get the root of the target folder
+def getRootDir() {
+  def dir = _findBuildYamlFile(meta.resources_dir)
+  assert dir != null: "Could not find .build.yaml in the folder structure"
+  dir.getParent()
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf'
+/**
+ * Recursively apply a function over the leaves of an object.
+ * @param obj The object to iterate over.
+ * @param fun The function to apply to each value.
+ * @return The object with the function applied to each value.
+ */
+def iterateMap(obj, fun) {
+  if (obj instanceof List && obj !instanceof String) {
+    return obj.collect{item ->
+      iterateMap(item, fun)
+    }
+  } else if (obj instanceof Map) {
+    return obj.collectEntries{key, item ->
+      [key.toString(), iterateMap(item, fun)]
+    }
+  } else {
+    return fun(obj)
+  }
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf'
+/**
+ * A view for printing the event of each channel as a YAML blob.
+ * This is useful for debugging.
+ */
+def niceView() {
+  workflow niceViewWf {
+    take: input
+    main:
+      output = input
+        | view{toYamlBlob(it)}
+    emit: output
+  }
+  return niceViewWf
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf'
+
+def readCsv(file_path) {
+  def output = []
+  def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path
+
+  // todo: allow escaped quotes in string
+  // todo: allow single quotes?
+  def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')
+  def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''')
+
+  def br = java.nio.file.Files.newBufferedReader(inputFile)
+
+  def row = -1
+  def header = null
+  while (br.ready() && header == null) {
+    def line = br.readLine()
+    row++
+    if (!line.startsWith("#")) {
+      header = splitRegex.split(line, -1).collect{field ->
+        def m = removeQuote.matcher(field)
+        m.find() ? m.replaceFirst('$1') : field
+      }
+    }
+  }
+  assert header != null: "CSV file should contain a header"
+
+  while (br.ready()) {
+    def line = br.readLine()
+    row++
+    if (line == null) {
+      br.close()
+      break
+    }
+
+    if (!line.startsWith("#")) {
+      def predata = splitRegex.split(line, -1)
+      def data = predata.collect{field ->
+        if (field == "") {
+          return null
+        }
+        def m = removeQuote.matcher(field)
+        if (m.find()) {
+          return m.replaceFirst('$1')
+        } else {
+          return field
+        }
+      }
+      assert header.size() == data.size(): "Row $row should contain the same number of fields as the header"
+
+      def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null}
+      output.add(dataMap)
+    }
+  }
+
+  output
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf'
+def readJson(file_path) {
+  def inputFile = file_path !instanceof Path ?
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "process_dataset", + "namespace" : "data_processors", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "label" : "Common Dataset", + "summary" : "A subset of the common dataset.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "obs" : [ + { + "type" : "string", + "name" : "batch", + "description" : "Batch information", + "required" : false + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_test", + "label" : "Test data", + "summary" : "The subset of molecules used for the test dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" 
: [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + }, + { + "name" : "train_sum", + "type" : "integer", + "description" : "The total number of counts in the training dataset.", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--method", + "description" : "The process method to assign train/test.", + "default" : [ + "mcv" + ], + "required" : false, + "choices" : [ + "mcv" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--train_frac", + "description" : "The fraction the molecules need to be split to train dataset", + "default" : [ + 0.9 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seed", + "description" : "A seed for the subsampling.", + "example" : [ + 123 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_obs_limit", + "description" : "The maximum number of cells the dataset may have before subsampling according to `obs.batch`.", + "default" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "helper.py" + } + ], + "description" : "Split data using molecular cross-validation.\n\nSplits molecules into two (potentially overlapping) groups using a fraction ratio.\nThese are output as two separate AnnData objects.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/common/cxg_immune_cell_atlas", + "dest" : "resources_test/common/cxg_immune_cell_atlas" + } + ], + "info" : { + "type" : "process_dataset", + "type_info" : { + "label" : "Data processor", + "summary" : "A denoising dataset processor.", + "description" : "A component for processing a Common Dataset into a task-specific dataset.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "links" : { + 
"repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "midcpu", + "midtime" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "numpy", + "scipy" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/data_processors/process_dataset/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/data_processors/process_dataset", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. 
In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import sys +import random +import anndata as ad +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_train': $( if [ ! -z ${VIASH_PAR_OUTPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_TRAIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_test': $( if [ ! -z ${VIASH_PAR_OUTPUT_TEST+x} ]; then echo "r'${VIASH_PAR_OUTPUT_TEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'method': $( if [ ! -z ${VIASH_PAR_METHOD+x} ]; then echo "r'${VIASH_PAR_METHOD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'train_frac': $( if [ ! -z ${VIASH_PAR_TRAIN_FRAC+x} ]; then echo "float(r'${VIASH_PAR_TRAIN_FRAC//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_obs_limit': $( if [ ! 
-z ${VIASH_PAR_N_OBS_LIMIT+x} ]; then echo "int(r'${VIASH_PAR_N_OBS_LIMIT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from helper import split_molecules + +# set random state +random_state = np.random.RandomState(par['seed']) + +print(">> Load Data", flush=True) +adata = ad.read_h5ad(par["input"]) + +# limit to max number of observations +adata_output = adata.copy() + +if "batch" in adata.obs: + print(f">> Subsampling observations by largest batch", flush=True) + batch_counts = adata.obs.groupby('batch').size() + sorted_batches = batch_counts.sort_values(ascending=False) + selected_batch = sorted_batches.index[0] + adata_output = adata[adata.obs["batch"]==selected_batch,:].copy() + +if adata_output.n_obs > par["n_obs_limit"]: + print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True) + print(f">> Setting seed to {par['seed']}", flush=True) + random.seed(par["seed"]) + obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs) + obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False) + adata_output = adata_output[obs_index].copy() + +# remove all layers except for counts +print(">> Remove all layers except for counts", flush=True) +for key in list(adata_output.layers.keys()): + if key != "counts": + del adata_output.layers[key] + +# round counts and convert to int +print(">> Round counts and convert to int", flush=True) +counts = np.array(adata_output.layers["counts"]).round().astype(int) + +print(">> process and split data", flush=True) +train_data, test_data = split_molecules( + counts.data, par["train_frac"], 0.0, random_state +) + +X_train = counts.copy() +X_test = counts.copy() +X_train.data = train_data +X_test.data = test_data +X_train.eliminate_zeros() +X_test.eliminate_zeros() + +# copy adata to train_set, test_set +print(">> Create AnnData output objects", flush=True) +train_uns_keys = ["dataset_id", "dataset_organism"] +output_train = ad.AnnData( + layers={"counts": X_train}, + obs=adata_output.obs[[]], + var=adata_output.var[[]], + uns={key: adata_output.uns[key] for key in train_uns_keys} +) +test_uns_keys = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +output_test = ad.AnnData( + layers={"counts": X_test}, + obs=adata_output.obs[[]], + var=adata_output.var[[]], + uns={key: adata_output.uns[key] for key in test_uns_keys} +) + +# add additional information for the train set +output_test.uns["train_sum"] = X_train.sum() + +# Remove no cells that do not have enough reads +print(">> Remove cells that do not have enough reads", flush=True) +is_missing = np.array(X_train.sum(axis=0) == 0) + +output_train = output_train[:, ~is_missing.flatten()] +output_test = output_test[:, ~is_missing.flatten()] + +print(">> Write to file", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. 
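+ *
+ * As an illustrative sketch (argument names are taken from this component's
+ * config; the id and file names are hypothetical), an incoming event such as
+ *   ["run", [input: file("dataset.h5ad"), train_frac: 0.9]]
+ * is converted into the positional tuple consumed by the generated process, and
+ * the files it writes are gathered back into an output event such as
+ *   ["run", [output_train: file("train.h5ad"), output_test: file("test.h5ad")]]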
+ */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/data_processors/process_dataset", + "tag" : "1.0.0" + }, + "label" : [ + "highmem", + "midcpu", + "midtime" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
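+  //
+  // Illustrative sketches of the List and Map forms, using this component's
+  // `output_train` argument (the `train_h5ad` key is a hypothetical name):
+  //   toState: ["output_train"]               // state.output_train = output.output_train
+  //   toState: ["train_h5ad": "output_train"] // state.train_h5ad = output.output_train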
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/data_processors/process_dataset/nextflow.config b/target/nextflow/data_processors/process_dataset/nextflow.config new file mode 100644 index 0000000..de872a8 --- /dev/null +++ b/target/nextflow/data_processors/process_dataset/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'data_processors/process_dataset' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = 'Split data using molecular cross-validation.\n\nSplits molecules into two (potentially overlapping) groups using a fraction ratio.\nThese are output as two separate AnnData objects.\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: 
highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/methods/alra/.config.vsh.yaml b/target/nextflow/methods/alra/.config.vsh.yaml new file mode 100644 index 0000000..257628e --- /dev/null +++ b/target/nextflow/methods/alra/.config.vsh.yaml @@ -0,0 +1,278 @@ +name: "alra" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--norm" + description: "Normalization method" + info: null + default: + - "log" + required: false + choices: + - "sqrt" + - "log" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +label: "ALRA" +summary: "ALRA imputes missing values in scRNA-seq data by computing rank-k approximation,\ + \ thresholding by gene, and rescaling the matrix." +description: "Adaptively-thresholded Low Rank Approximation (ALRA). \n\nALRA is a\ + \ method for imputation of missing values in single cell RNA-sequencing data, \n\ + described in the preprint, \"Zero-preserving imputation of scRNA-seq data using\ + \ low-rank approximation\" \navailable [here](https://www.biorxiv.org/content/early/2018/08/22/397588).\ + \ Given a \nscRNA-seq expression matrix, ALRA first computes its rank-k approximation\ + \ using randomized SVD. \nNext, each row (gene) is thresholded by the magnitude\ + \ of the most negative value of that gene. 
\nFinally, the matrix is rescaled.\n" +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/alra.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1101/397588" +links: + repository: "https://github.com/KlugerLab/ALRA" + docker_registry: "ghcr.io" + documentation: "https://github.com/KlugerLab/ALRA/blob/master/README.md" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_r:1.0.0" + namespace_separator: "/" + setup: + - type: "r" + cran: + - "Matrix" + - "rsvd" + github: + - "KlugerLab/ALRA" + bioc_force_install: false + entrypoint: [] + cmd: null +build_info: + config: "src/methods/alra/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/methods/alra" + executable: "target/nextflow/methods/alra/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. 
Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/methods/alra/main.nf b/target/nextflow/methods/alra/main.nf new file mode 100644 index 0000000..9f85161 --- /dev/null +++ b/target/nextflow/methods/alra/main.nf @@ -0,0 +1,3732 @@ +// alra 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. 
+ +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? 
null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if 
(items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." 
+ } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. 
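+ *
+ * As an illustrative sketch (hypothetical config with a single `--input` file
+ * argument and two parameter sets with ids `foo` and `bar`), the returned list
+ * could look like:
+ *
+ *   [
+ *     ["foo", [input: file("/path/to/foo.h5ad")]],
+ *     ["bar", [input: file("/path/to/bar.h5ad")]]
+ *   ]
+ *
+ * which `channelFromParams()` then passes to `Channel.fromList()` to create the
+ * output channel.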
+ * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? 
+ params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? 
args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. 
+ * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. 
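+ *
+ * A minimal usage sketch (channel and key names are hypothetical):
+ *
+ *   // merge a component's output back into the original state
+ *   safeJoin(component_out_ch, input_ch, "my_component")
+ *     | map { id, newState, oldState -> [id, oldState + newState] }
+ *
+ * where both channels emit `[id, state]` tuples; each joined event is
+ * `[id, newState, oldState]`, with `newState` taken from the target channel
+ * and `oldState` from the source channel.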
+ */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. 
+ | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? 
+ "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. 
+ * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. 
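+ *
+ * An illustrative example:
+ *   iterateMap([a: [1, 2], b: 3], { it * 2 })
+ *   // returns [a: [2, 4], b: 6]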
+ */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "alra", + "namespace" : "methods", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--norm", + "description" : "Normalization method", + "default" : [ + "log" + ], + "required" : false, + "choices" : [ + "sqrt", + "log" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + } + ], + "label" : "ALRA", + "summary" : "ALRA imputes missing values in scRNA-seq data by computing rank-k approximation, thresholding by gene, and rescaling the matrix.", + "description" : "Adaptively-thresholded Low Rank Approximation (ALRA). \n\nALRA is a method for imputation of missing values in single cell RNA-sequencing data, \ndescribed in the preprint, \\"Zero-preserving imputation of scRNA-seq data using low-rank approximation\\" \navailable [here](https://www.biorxiv.org/content/early/2018/08/22/397588). Given a \nscRNA-seq expression matrix, ALRA first computes its rank-k approximation using randomized SVD. \nNext, each row (gene) is thresholded by the magnitude of the most negative value of that gene. 
\nFinally, the matrix is rescaled.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" + }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "v1" : { + "path" : "openproblems/tasks/denoising/methods/alra.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "preferred_normalization" : "counts", + "type" : "method", + "type_info" : { + "label" : "Method", + "summary" : "A method.", + "description" : "A denoising method to remove noise (i.e. technical artifacts) from a dataset.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "references" : { + "doi" : [ + "10.1101/397588" + ] + }, + "links" : { + "repository" : "https://github.com/KlugerLab/ALRA", + "docker_registry" : "ghcr.io", + "documentation" : "https://github.com/KlugerLab/ALRA/blob/master/README.md" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_r:1.0.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "r", + "cran" : [ + "Matrix", + "rsvd" + ], + "github" : [ + "KlugerLab/ALRA" + ], + "bioc_force_install" : false + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/methods/alra/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/methods/alra", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. 
Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : 
"https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +cat(">> Loading dependencies\\\\n") +library(anndata, warn.conflicts = FALSE) +library(ALRA, warn.conflicts = FALSE) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input_train" = $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_TRAIN" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "norm" = $( if [ ! -z ${VIASH_PAR_NORM+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_NORM" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! 
-z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +cat(">> Load input data\\\\n") +input_train <- read_h5ad(par\\$input_train) + +cat(">> Set normalization method\\\\n") +if (par\\$norm == "sqrt") { + norm_fn <- sqrt + denorm_fn <- function(x) x^2 +} else if (par\\$norm == "log") { + norm_fn <- log1p + denorm_fn <- expm1 +} else { + stop("Unknown normalization method: ", par\\$norm) +} + +cat(">> Normalize data\\\\n") +data <- as.matrix(input_train\\$layers[["counts"]]) +totalPerCell <- rowSums(data) +data <- sweep(data, 1, totalPerCell, "/") +data <- norm_fn(data) + +cat(">> Run ALRA\\\\n") +data <- alra(data)\\$A_norm_rank_k_cor_sc +data <- denorm_fn(data) +data <- sweep(data, 1, totalPerCell, "*") + +cat(">> Store output\\\\n") +output <- AnnData( + layers = list(denoised = data), + obs = input_train\\$obs[, c(), drop = FALSE], + var = input_train\\$var[, c(), drop = FALSE], + uns = list( + dataset_id = input_train\\$uns[["dataset_id"]], + method_id = meta\\$name + ) +) + +cat(">> Write output to file\\\\n") +output\\$write_h5ad(par\\$output, compression = "gzip") +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? 
data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
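+    // e.g. without an explicit key, repeated instantiations of this module are registered
+    // as '<key>_process', '<key>_process1', '<key>_process2', ... (see procKey above).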
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/methods/alra", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
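+  //
+  // A hypothetical sketch of `toState` used together with `fromState` via `.run()`
+  // (the module name and state keys are illustrative only, not part of this component):
+  //
+  //   input_ch
+  //     | some_module.run(
+  //         // read state.train_h5ad and pass it to the module as `input_train`
+  //         fromState: [input_train: "train_h5ad"],
+  //         // copy the module's `output` file back into the state under `denoised_h5ad`
+  //         toState: [denoised_h5ad: "output"]
+  //       )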
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/methods/alra/nextflow.config b/target/nextflow/methods/alra/nextflow.config new file mode 100644 index 0000000..3f55903 --- /dev/null +++ b/target/nextflow/methods/alra/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'methods/alra' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = 'Adaptively-thresholded Low Rank Approximation (ALRA). \n\nALRA is a method for imputation of missing values in single cell RNA-sequencing data, \ndescribed in the preprint, "Zero-preserving imputation of scRNA-seq data using low-rank approximation" \navailable [here](https://www.biorxiv.org/content/early/2018/08/22/397588). Given a \nscRNA-seq expression matrix, ALRA first computes its rank-k approximation using randomized SVD. \nNext, each row (gene) is thresholded by the magnitude of the most negative value of that gene. 
\nFinally, the matrix is rescaled.\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/methods/dca/.config.vsh.yaml b/target/nextflow/methods/dca/.config.vsh.yaml new file mode 100644 index 0000000..9cccc6b --- /dev/null +++ b/target/nextflow/methods/dca/.config.vsh.yaml @@ -0,0 +1,285 @@ +name: "dca" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." 
+ info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--epochs" + description: "Number of total epochs in training" + info: null + default: + - 300 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "DCA" +summary: "A deep autoencoder with ZINB loss function to address the dropout effect\ + \ in count data" +description: "\"Deep Count Autoencoder\n\nRemoves the dropout effect by taking the\ + \ count structure, overdispersed nature and sparsity of the data into account \n\ + using a deep autoencoder with zero-inflated negative binomial (ZINB) loss function.\"\ + \n" +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/dca.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. 
technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1038/s41467-018-07931-2" +links: + repository: "https://github.com/theislab/dca" + docker_registry: "ghcr.io" + documentation: "https://github.com/theislab/dca#readme" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.9" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + github: + - "openproblems-bio/core#subdirectory=packages/python/openproblems" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.8.0" + - "scanpy" + - "pyyaml" + - "requests" + - "jsonschema" + - "git+https://github.com/scottgigante-immunai/dca.git@patch-1" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/dca/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/methods/dca" + executable: "target/nextflow/methods/dca/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. 
The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/methods/dca/main.nf b/target/nextflow/methods/dca/main.nf new file mode 100644 index 0000000..aa9dce6 --- /dev/null +++ b/target/nextflow/methods/dca/main.nf @@ -0,0 +1,3724 @@ +// dca 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. 
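+
+// Illustrative standalone invocation (assumed, not generated by viash; the input path
+// reuses the test resource referenced in this component's config, the output directory
+// and profile are placeholders):
+//
+//   nextflow run target/nextflow/methods/dca/main.nf \
+//     -profile docker \
+//     --input_train resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad \
+//     --publish_dir output/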
+ +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? 
null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if 
(items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." 
+ } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. 
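+ *
+ * Illustrative example (argument names taken from this task's configs): with
+ *   params = [id: "run", input_train: "train.h5ad"]
+ * and no `param_list`, this yields a single event
+ *   ["run", [input_train: file("train.h5ad")]]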
+ * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? 
+ params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? 
args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. 
+ * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. 
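+ *            Example (illustrative): `id: { id, data, comp -> id + "_denoised" }`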
+ * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. 
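+ *
+ * Example (illustrative, channel names assumed):
+ *   safeJoin(out_ch, input_ch, "dca")
+ * joins each output event of `out_ch` back onto the matching event of `input_ch` by id,
+ * using `_meta.join_id` as described above when the output id differs from the input id.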
+ */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. 
+ | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? 
+ "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. 
+ * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. 
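+ *
+ * Example (illustrative): `iterateMap([a: [1, 2], b: 3], { it * 10 })`
+ * returns `[a: [10, 20], b: 30]`.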
+ */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
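+      // Hedged usage sketch, reusing the example values defined above (all paths hypothetical):
+      //   --input_states '/path/to/input/directory/**/state.yaml' \
+      //   --filter 'foo/.*/state.yaml' \
+      //   --rename_keys 'newKey1:oldKey1;newKey2:oldKey2' \
+      //   --settings '{"output_dataset": "dataset.h5ad", "k": 10}'
+      // is how findStates() is typically parameterised from the command line.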
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
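+        // Illustrative sketch (hypothetical values): for state_ = [output: denoised.h5ad,
+        // scores: [a.tsv, b.tsv]] and prefix "run1.dca", collectInputOutputPaths() yields the pairs
+        //   [denoised.h5ad, "run1.dca.output.h5ad"], [a.tsv, "run1.dca.scores_0.tsv"],
+        //   [b.tsv, "run1.dca.scores_1.tsv"]
+        // i.e. map keys and list indices are appended to the prefix and the file extension is kept.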
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
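+        // Illustrative note (hypothetical values): unless params.output_state overrides the template,
+        // id_ = "run1" and key_ = "dca" publish the state as 'run1.dca.state.yaml', with the listed
+        // output files copied relative to that YAML.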
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
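+              // Illustrative example of the wildcard handling above (hypothetical template): with
+              // par.multiple set and the filename "run1.dca.scores_*.h5ad" for three output values,
+              // the files become run1.dca.scores_0.h5ad, run1.dca.scores_1.h5ad and
+              // run1.dca.scores_2.h5ad, stored in the state relative to the state.yaml directory.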
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
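+      // Hedged usage sketch (hypothetical state keys): a call such as
+      //   dca.run(fromState: [input_train: "output_train"], toState: [output_denoised: "output"])
+      // feeds state.output_train into the component's --input_train argument and stores the
+      // component's --output back into the state under 'output_denoised'. A plain list, e.g.
+      //   fromState: ["input_train"], maps keys one-to-one.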
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
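+        // Illustrative sketch of the channel contract (hypothetical values): an input event
+        //   ["run1", [input_train: file("train.h5ad"), epochs: 100]]
+        // leaves this workflow as ["run1", <new state>], where <new state> is whatever toState
+        // produced; by default that is the module's output map, e.g. [output: <denoised h5ad>].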
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "dca", + "namespace" : "methods", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--epochs", + "description" : "Number of total epochs in training", + "default" : [ + 300 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "label" : "DCA", + "summary" : "A deep autoencoder with ZINB loss function to address the dropout effect in count data", + "description" : "\\"Deep Count Autoencoder\n\nRemoves the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account \nusing a deep autoencoder with zero-inflated negative binomial (ZINB) loss function.\\"\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" 
+ }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "v1" : { + "path" : "openproblems/tasks/denoising/methods/dca.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "preferred_normalization" : "counts", + "type" : "method", + "type_info" : { + "label" : "Method", + "summary" : "A method.", + "description" : "A denoising method to remove noise (i.e. technical artifacts) from a dataset.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "references" : { + "doi" : [ + "10.1038/s41467-018-07931-2" + ] + }, + "links" : { + "repository" : "https://github.com/theislab/dca", + "docker_registry" : "ghcr.io", + "documentation" : "https://github.com/theislab/dca#readme" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.9", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "github" : [ + "openproblems-bio/core#subdirectory=packages/python/openproblems" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.8.0", + "scanpy", + "pyyaml", + "requests", + "jsonschema", + "git+https://github.com/scottgigante-immunai/dca.git@patch-1", + "numpy<2" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/methods/dca/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/methods/dca", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. 
Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def 
innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad +from dca.api import dca + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'epochs': $( if [ ! -z ${VIASH_PAR_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) + +print("Remove unneeded data", flush=True) +output = ad.AnnData( + X=input_train.layers["counts"], + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Run DCA", flush=True) +dca(output, epochs=par["epochs"]) + +print("Move output to correct location", flush=True) +output.layers["denoised"] = output.X +del output.X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? 
data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/methods/dca", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/methods/dca/nextflow.config b/target/nextflow/methods/dca/nextflow.config new file mode 100644 index 0000000..3caaac4 --- /dev/null +++ b/target/nextflow/methods/dca/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'methods/dca' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = '"Deep Count Autoencoder\n\nRemoves the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account \nusing a deep autoencoder with zero-inflated negative binomial (ZINB) loss function."\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + 
withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/methods/knn_smoothing/.config.vsh.yaml b/target/nextflow/methods/knn_smoothing/.config.vsh.yaml new file mode 100644 index 0000000..1c43809 --- /dev/null +++ b/target/nextflow/methods/knn_smoothing/.config.vsh.yaml @@ -0,0 +1,268 @@ +name: "knn_smoothing" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "KNN Smoothing" +summary: "Iterative kNN-smoothing denoises scRNA-seq data by iteratively increasing\ + \ the size of neighbourhoods for smoothing until a maximum k value is reached." +description: "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq\ + \ expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first\ + \ applies initial normalisation and smoothing. Then, a chosen number of principal\ + \ components is used to calculate Euclidean distances between cells. Minimally sized\ + \ neighbourhoods are initially determined from these Euclidean distances, and expression\ + \ profiles are shared between neighbouring cells. Then, the resultant smoothed matrix\ + \ is used as input to the next step of smoothing, where the size (k) of the considered\ + \ neighbourhoods is increased, leading to greater smoothing. This process continues\ + \ until a chosen maximum k value has been reached, at which point the iteratively\ + \ smoothed object is then optionally scaled to yield a final result." 
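+# The description above can be read as an iterative smoothing loop. A rough,
+# Groovy-style sketch of that loop follows; the helper names (normalise, pca,
+# nearestNeighbours, aggregateCounts) and the doubling step size are
+# illustrative assumptions, not part of this component:
+#   def smoothed = normalise(counts)
+#   def k = 1
+#   while (k < kMax) {
+#     k = Math.min(2 * k, kMax)              // neighbourhood size grows each round
+#     def pcs = pca(normalise(smoothed), nPcs)
+#     def nn = nearestNeighbours(pcs, k)     // Euclidean distances in PC space
+#     smoothed = aggregateCounts(counts, nn) // share raw counts across neighbours
+#   }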
+test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/knn_smoothing.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1101/217737" +links: + repository: "https://github.com/yanailab/knn-smoothing" + docker_registry: "ghcr.io" + documentation: "https://github.com/yanailab/knn-smoothing#readme" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "scipy" + github: + - "scottgigante-immunai/knn-smoothing@python_package" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/knn_smoothing/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/methods/knn_smoothing" + executable: "target/nextflow/methods/knn_smoothing/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). 
In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/methods/knn_smoothing/main.nf b/target/nextflow/methods/knn_smoothing/main.nf new file mode 100644 index 0000000..40685f8 --- /dev/null +++ b/target/nextflow/methods/knn_smoothing/main.nf @@ -0,0 +1,3694 @@ +// knn_smoothing 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. 
The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? 
null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if 
(items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." 
+ } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. 
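+ *
+ * As a hedged illustration (hypothetical parameter values, no `param_list`):
+ *   params = [id: "run1", input_train: "train.h5ad"]
+ * would be parsed into a single parameter set of roughly the form
+ *   [ ["run1", [input_train: <Path to train.h5ad>]] ]
+ * which Channel.fromList then emits as one [id, state] event.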
+ * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? 
+ params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? 
args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. 
+ * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. 
+ */
+
+def safeJoin(targetChannel, sourceChannel, key) {
+  def sourceIDs = new IDChecker()
+
+  def sourceCheck = sourceChannel
+    | map { tup ->
+      sourceIDs.observe(tup[0])
+      tup
+    }
+  def targetCheck = targetChannel
+    | map { tup ->
+      def id = tup[0]
+
+      if (!sourceIDs.contains(id)) {
+        error (
+          "Error in module '${key}' when merging output with original state.\n" +
+          "  Reason: output with id '${id}' could not be joined with source channel.\n" +
+          "    If the IDs in the output channel differ from the input channel,\n" +
+          "    please set `tup[1]._meta.join_id` to the original ID.\n" +
+          "  Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" +
+          "  Unexpected ID in the output channel: '${id}'.\n" +
+          "  Example input event: [\"id\", [input: file(...)]],\n" +
+          "  Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]"
+        )
+      }
+      // TODO: add link to our documentation on how to fix this
+
+      tup
+    }
+
+  sourceCheck.cross(targetCheck)
+    | map{ left, right ->
+      right + left.drop(1)
+    }
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf'
+def _processArgument(arg) {
+  arg.multiple = arg.multiple != null ? arg.multiple : false
+  arg.required = arg.required != null ? arg.required : false
+  arg.direction = arg.direction != null ? arg.direction : "input"
+  arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";"
+  arg.plainName = arg.name.replaceAll("^-*", "")
+
+  if (arg.type == "file") {
+    arg.must_exist = arg.must_exist != null ? arg.must_exist : true
+    arg.create_parent = arg.create_parent != null ? arg.create_parent : true
+  }
+
+  // add default values to output files which haven't already got a default
+  if (arg.type == "file" && arg.direction == "output" && arg.default == null) {
+    def mult = arg.multiple ? "_*" : ""
+    def extSearch = ""
+    if (arg.default != null) {
+      extSearch = arg.default
+    } else if (arg.example != null) {
+      extSearch = arg.example
+    }
+    if (extSearch instanceof List) {
+      extSearch = extSearch[0]
+    }
+    def extSearchResult = extSearch.find("\\.[^\\.]+\$")
+    def ext = extSearchResult != null ? extSearchResult : ""
+    arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}"
+    if (arg.multiple) {
+      arg.default = [arg.default]
+    }
+  }
+
+  if (!arg.multiple) {
+    if (arg.default != null && arg.default instanceof List) {
+      arg.default = arg.default[0]
+    }
+    if (arg.example != null && arg.example instanceof List) {
+      arg.example = arg.example[0]
+    }
+  }
+
+  if (arg.type == "boolean_true") {
+    arg.default = false
+  }
+  if (arg.type == "boolean_false") {
+    arg.default = true
+  }
+
+  arg
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf'
+def addGlobalArguments(config) {
+  def localConfig = [
+    "argument_groups": [
+      [
+        "name": "Nextflow input-output arguments",
+        "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
+        "arguments" : [
+          [
+            'name': '--publish_dir',
+            'required': true,
+            'type': 'string',
+            'description': 'Path to an output directory.',
+            'example': 'output/',
+            'multiple': false
+          ],
+          [
+            'name': '--param_list',
+            'required': false,
+            'type': 'string',
+            'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.
+ | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? 
+ "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. 
+ * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. 
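+ *
+ * Illustrative example (not part of the generated code):
+ * iterateMap([a: [1, 2], b: 3], { it * 10 }) returns [a: [10, 20], b: 30].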
+ */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}"
+
+  // check for any unexpected keys
+  def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"]
+  def unexpectedKeys = workflowArgs.keySet() - expectedKeys
+  assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'"
+
+  // check whether directives exists and apply defaults
+  assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument"
+  assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}"
+  workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"])
+
+  // check whether auto exists and apply defaults
+  assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument"
+  assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}"
+  workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"])
+
+  // auto define publish, if so desired
+  if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) {
+    // can't assert at this level thanks to the no_publish profile
+    // assert params.containsKey("publishDir") || params.containsKey("publish_dir") :
+    //   "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" +
+    //   "  Example: params.publish_dir = \"./output/\""
+    def publishDir = getPublishDir()
+
+    if (publishDir != null) {
+      workflowArgs.directives.publishDir = [[
+        path: publishDir,
+        saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default
+        mode: "copy"
+      ]]
+    }
+  }
+
+  // auto define transcript, if so desired
+  if (workflowArgs.auto.transcript == true) {
+    // can't assert at this level thanks to the no_publish profile
+    // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") :
+    //   "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" +
+    //   "  Example: params.transcripts_dir = \"./transcripts/\""
+    def transcriptsDir =
+      params.containsKey("transcripts_dir") ? params.transcripts_dir :
+      params.containsKey("transcriptsDir") ? params.transcriptsDir :
+      params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" :
+      params.containsKey("publishDir") ? params.publishDir + "/_transcripts" :
+      null
+    if (transcriptsDir != null) {
+      def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss')
+      def transcriptsPublishDir = [
+        path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/",
+        saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }",
+        mode: "copy"
+      ]
+      def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : []
+      workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir
+    }
+  }
+
+  // if this is a stubrun, remove certain directives?
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
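+        // e.g. (illustrative values) ["run", "run", [output: "denoised.h5ad"]]
+        // becomes ["run", [output: "denoised.h5ad"]] once the join_id is dropped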
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "knn_smoothing", + "namespace" : "methods", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "label" : "KNN Smoothing", + "summary" : "Iterative kNN-smoothing denoises scRNA-seq data by iteratively increasing the size of neighbourhoods for smoothing until a maximum k value is reached.", + "description" : "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first applies initial normalisation and smoothing. Then, a chosen number of principal components is used to calculate Euclidean distances between cells. Minimally sized neighbourhoods are initially determined from these Euclidean distances, and expression profiles are shared between neighbouring cells. Then, the resultant smoothed matrix is used as input to the next step of smoothing, where the size (k) of the considered neighbourhoods is increased, leading to greater smoothing. 
This process continues until a chosen maximum k value has been reached, at which point the iteratively smoothed object is then optionally scaled to yield a final result.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" + }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "v1" : { + "path" : "openproblems/tasks/denoising/methods/knn_smoothing.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "preferred_normalization" : "counts", + "type" : "method", + "type_info" : { + "label" : "Method", + "summary" : "A method.", + "description" : "A denoising method to remove noise (i.e. technical artifacts) from a dataset.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "references" : { + "doi" : [ + "10.1101/217737" + ] + }, + "links" : { + "repository" : "https://github.com/yanailab/knn-smoothing", + "docker_registry" : "ghcr.io", + "documentation" : "https://github.com/yanailab/knn-smoothing#readme" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "scipy" + ], + "github" : [ + "scottgigante-immunai/knn-smoothing@python_package" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/methods/knn_smoothing/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/methods/knn_smoothing", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. 
A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. 
Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import knn_smooth +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! 
-z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par["input_train"]) + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"].astype(float).transpose().toarray() + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Run KNN smoothing", flush=True) +X = knn_smooth.knn_smoothing(X, k=10).transpose() + +print("Process data", flush=True) +output.layers["denoised"] = X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. 
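+ *
+ * Illustrative example (hypothetical values): an input event such as
+ *   ["run", [input_train: file("train.h5ad")]]
+ * is staged into the generated process, and the produced output files are
+ * mapped back into an event like ["run", [output: file("denoised.h5ad")]].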
+ */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
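+    // Illustrative sketch of unique keys for repeated calls (hypothetical keys):
+    //   knn_smoothing.run(key: "smoothing_pass_1")
+    //   knn_smoothing.run(key: "smoothing_pass_2")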
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/methods/knn_smoothing", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
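+  // - (illustrative, hypothetical key) `toState: ["denoised_h5ad": "output"]` would store
+  //   this module's `output` file under the `denoised_h5ad` key of the state.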
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/methods/knn_smoothing/nextflow.config b/target/nextflow/methods/knn_smoothing/nextflow.config new file mode 100644 index 0000000..c8636b2 --- /dev/null +++ b/target/nextflow/methods/knn_smoothing/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'methods/knn_smoothing' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = 'Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first applies initial normalisation and smoothing. Then, a chosen number of principal components is used to calculate Euclidean distances between cells. Minimally sized neighbourhoods are initially determined from these Euclidean distances, and expression profiles are shared between neighbouring cells. Then, the resultant smoothed matrix is used as input to the next step of smoothing, where the size (k) of the considered neighbourhoods is increased, leading to greater smoothing. This process continues until a chosen maximum k value has been reached, at which point the iteratively smoothed object is then optionally scaled to yield a final result.' 
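+  // Illustrative standalone invocation (hypothetical paths), using the profiles defined below:
+  //   nextflow run target/nextflow/methods/knn_smoothing/main.nf \
+  //     -profile docker --input_train train.h5ad --output denoised.h5ad --publish_dir output/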
+} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/methods/magic/.config.vsh.yaml b/target/nextflow/methods/magic/.config.vsh.yaml new file mode 100644 index 0000000..8eee16d --- /dev/null +++ b/target/nextflow/methods/magic/.config.vsh.yaml @@ -0,0 +1,323 @@ +name: "magic" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--solver" + description: "Which solver to use." 
+ info: null + default: + - "exact" + required: false + choices: + - "exact" + - "approximate" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--norm" + description: "Normalization method" + info: null + default: + - "log" + required: false + choices: + - "sqrt" + - "log" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--decay" + description: "sets decay rate of kernel tails" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--t" + description: "power to which the diffusion operator is powered" + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "MAGIC" +summary: "MAGIC imputes and denoises scRNA-seq data that is noisy or dropout-prone." +description: "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method\ + \ for imputation and denoising of noisy or dropout-prone single cell RNA-sequencing\ + \ data. Given a normalised scRNA-seq expression matrix, it first calculates Euclidean\ + \ distances between each pair of cells in the dataset, which is then augmented using\ + \ a Gaussian kernel (function) and row-normalised to give a normalised affinity\ + \ matrix. A t-step markov process is then calculated, by powering this affinity\ + \ matrix t times. Finally, the powered affinity matrix is right-multiplied by the\ + \ normalised data, causing the final imputed values to take the value of a per-gene\ + \ average weighted by the affinities of cells. The resultant imputed matrix is then\ + \ rescaled, to more closely match the magnitude of measurements in the normalised\ + \ (input) matrix." +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + v1: + path: "openproblems/tasks/denoising/methods/magic.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + variants: + magic_approx: + solver: "approximate" + magic_knn_naive: + norm: "log" + decay: "none" + t: 1 + preferred_normalization: "counts" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. 
technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1016/j.cell.2018.05.061" +links: + repository: "https://github.com/KrishnaswamyLab/MAGIC" + docker_registry: "ghcr.io" + documentation: "https://github.com/KrishnaswamyLab/MAGIC#readme" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "scprep" + - "magic-impute" + - "scipy" + - "scikit-learn<1.2" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/magic/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/methods/magic" + executable: "target/nextflow/methods/magic/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. 
As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/methods/magic/main.nf b/target/nextflow/methods/magic/main.nf new file mode 100644 index 0000000..c77cf38 --- /dev/null +++ b/target/nextflow/methods/magic/main.nf @@ -0,0 +1,3797 @@ +// magic 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? 
" id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? 
null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are 
unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. 
These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param parValues A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their separator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to initialise + * a Vdsl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a Vdsl3 Channel. + * + * @param params Input parameters. 
Can optionally contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel. Possible formats of param_list are: a csv file, + * a json file, a yaml file or a yaml blob. Each parameter set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameter sets, with the first element of each event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List<Tuple2<String, Map<String, Object>>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) are unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionally contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel. Possible formats of param_list are: a csv file, + * a json file, a yaml file or a yaml blob. Each parameter set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as tuples whose + * first element contains the ID of the event and whose second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? true : args.stopOnError + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and applying a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameters to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containing the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and applying a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
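+ * + * A minimal usage sketch (added for illustration; the module names 'method_a' and 'method_b' and the state keys 'dataset' and 'output' are hypothetical, not part of this helper):
+ * + *   input_ch + *     | runComponents( + *         components: [ method_a, method_b ], + *         fromState: [ "input": "dataset" ], + *         toState: [ "output": "output" ] + *       )
+ * + * Each event's 'dataset' value is passed to the components as their 'input' argument, and each component's 'output' is merged back into the event state. The non-deprecated runEach (defined below) takes the same style of arguments. 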
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
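+ * + * A hypothetical example (the id 'sample1' and the state keys are illustrative only, not part of this helper): if sourceChannel
+ * emits [ "sample1", [input: ...] ] and targetChannel emits [ "sample1", [output: ...] ], then
+ * safeJoin(targetChannel, sourceChannel, "my_module") emits [ "sample1", [output: ...], [input: ...] ]. + *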
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
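+  /* Illustrative sketch (not generated by viash; names are hypothetical):
+     assuming a pipeline has imported this module as `magic`, the arguments
+     validated above are the ones accepted by `.run()`, e.g.
+
+       magic.run(
+         key: "magic_sqrt",                                  // hypothetical key
+         args: [norm: "sqrt"],                               // fixed component arguments
+         directives: [label: ["highmem"], publishDir: "output/"],
+         auto: [publish: true]
+       )
+
+     Unknown keys or malformed directives are rejected by the asserts above. */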
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
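+        // the leading join_id is dropped below: it only exists so that safeJoin()
+        // could match this output back to the original state tuple; downstream
+        // modules again receive plain [id, state, ...] tuples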
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "magic", + "namespace" : "methods", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--solver", + "description" : "Which solver to use.", + "default" : [ + "exact" + ], + "required" : false, + "choices" : [ + "exact", + "approximate" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--norm", + "description" : "Normalization method", + "default" : [ + "log" + ], + "required" : false, + "choices" : [ + "sqrt", + "log" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--decay", + "description" : "sets decay rate of kernel tails", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--t", + "description" : "power to which the diffusion operator is powered", + "default" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "label" : "MAGIC", + "summary" : "MAGIC imputes and denoises 
scRNA-seq data that is noisy or dropout-prone.", + "description" : "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for imputation and denoising of noisy or dropout-prone single cell RNA-sequencing data. Given a normalised scRNA-seq expression matrix, it first calculates Euclidean distances between each pair of cells in the dataset, which is then augmented using a Gaussian kernel (function) and row-normalised to give a normalised affinity matrix. A t-step markov process is then calculated, by powering this affinity matrix t times. Finally, the powered affinity matrix is right-multiplied by the normalised data, causing the final imputed values to take the value of a per-gene average weighted by the affinities of cells. The resultant imputed matrix is then rescaled, to more closely match the magnitude of measurements in the normalised (input) matrix.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" + }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "v1" : { + "path" : "openproblems/tasks/denoising/methods/magic.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "variants" : { + "magic_approx" : { + "solver" : "approximate" + }, + "magic_knn_naive" : { + "norm" : "log", + "decay" : "none", + "t" : 1 + } + }, + "preferred_normalization" : "counts", + "type" : "method", + "type_info" : { + "label" : "Method", + "summary" : "A method.", + "description" : "A denoising method to remove noise (i.e. 
technical artifacts) from a dataset.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "references" : { + "doi" : [ + "10.1016/j.cell.2018.05.061" + ] + }, + "links" : { + "repository" : "https://github.com/KrishnaswamyLab/MAGIC", + "docker_registry" : "ghcr.io", + "documentation" : "https://github.com/KrishnaswamyLab/MAGIC#readme" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "pip" : [ + "scprep", + "magic-impute", + "scipy", + "scikit-learn<1.2", + "numpy<2" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/methods/magic/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/methods/magic", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. 
The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad +import numpy as np +import scprep +from magic import MAGIC +import scipy + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! 
-z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'solver': $( if [ ! -z ${VIASH_PAR_SOLVER+x} ]; then echo "r'${VIASH_PAR_SOLVER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'norm': $( if [ ! -z ${VIASH_PAR_NORM+x} ]; then echo "r'${VIASH_PAR_NORM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'decay': $( if [ ! -z ${VIASH_PAR_DECAY+x} ]; then echo "int(r'${VIASH_PAR_DECAY//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 't': $( if [ ! -z ${VIASH_PAR_T+x} ]; then echo "int(r'${VIASH_PAR_T//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par["input_train"]) + +print("Set normalization method", flush=True) +if par["norm"] == "sqrt": + norm_fn = np.sqrt + denorm_fn = np.square +elif par["norm"] == "log": + norm_fn = np.log1p + denorm_fn = np.expm1 +else: + raise ValueError("Unknown normalization method: " + par["norm"] + ".") + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"] + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["name"] + } +) + +del input_train + +print("Normalize data", flush=True) +X, libsize = scprep.normalize.library_size_normalize( + X, + rescale=1, + return_library_size=True +) +X = scprep.utils.matrix_transform(X, norm_fn) + +print("Run MAGIC", flush=True) +magic = MAGIC( + solver=par["solver"], + decay=par["decay"], + t=par["t"], + verbose=False, +) +X = magic.fit_transform(X, genes="all_genes") + +print("Denormalizing data", flush=True) +X = scprep.utils.matrix_transform(X, denorm_fn) +X = scprep.utils.matrix_vector_elementwise_multiply(X, libsize, axis=0) + +print("Create output AnnData", flush=True) +output.layers["denoised"] = X + +print("Write Data", flush=True) +output.write_h5ad(par["output"], compression="gzip") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? 
data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
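+    // For example (hypothetical keys): two un-keyed invocations of a module named
+    // "magic" would register processes "magic_process" and "magic_process1";
+    // giving each call a unique key avoids the clash:
+    //   magic.run(key: "magic_exact", args: [solver: "exact"])
+    //   magic.run(key: "magic_approx", args: [solver: "approximate"])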
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/methods/magic", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/methods/magic/nextflow.config b/target/nextflow/methods/magic/nextflow.config new file mode 100644 index 0000000..09b88eb --- /dev/null +++ b/target/nextflow/methods/magic/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'methods/magic' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = 'MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for imputation and denoising of noisy or dropout-prone single cell RNA-sequencing data. Given a normalised scRNA-seq expression matrix, it first calculates Euclidean distances between each pair of cells in the dataset, which is then augmented using a Gaussian kernel (function) and row-normalised to give a normalised affinity matrix. A t-step markov process is then calculated, by powering this affinity matrix t times. Finally, the powered affinity matrix is right-multiplied by the normalised data, causing the final imputed values to take the value of a per-gene average weighted by the affinities of cells. The resultant imputed matrix is then rescaled, to more closely match the magnitude of measurements in the normalised (input) matrix.' 
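+  // Sketch of the imputation step described above (notation assumed here, not
+  // taken verbatim from the MAGIC paper): with row-normalised affinity matrix M
+  // built from the Gaussian-kernel distances and normalised expression matrix X,
+  // MAGIC computes X_hat = M^t * X and then rescales X_hat to match the
+  // magnitude of X.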
+} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/methods/scprint/.config.vsh.yaml b/target/nextflow/methods/scprint/.config.vsh.yaml new file mode 100644 index 0000000..fd62e74 --- /dev/null +++ b/target/nextflow/methods/scprint/.config.vsh.yaml @@ -0,0 +1,299 @@ +name: "scprint" +namespace: "methods" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--model_name" + description: "Which model to use. Not used if --model is provided." 
+ info: null + default: + - "large" + required: false + choices: + - "large" + - "medium" + - "small" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model" + description: "Path to the scPRINT model." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +label: "scPRINT" +summary: "scPRINT is a large transformer model built for the inference of gene networks" +description: "scPRINT is a large transformer model built for the inference of gene\ + \ networks\n(connections between genes explaining the cell's expression profile)\ + \ from\nscRNAseq data.\n\nIt uses novel encoding and decoding of the cell expression\ + \ profile and new\npre-training methodologies to learn a cell model.\n\nscPRINT\ + \ can be used to perform the following analyses:\n\n- expression denoising: increase\ + \ the resolution of your scRNAseq data\n- cell embedding: generate a low-dimensional\ + \ representation of your dataset\n- label prediction: predict the cell type, disease,\ + \ sequencer, sex, and\n ethnicity of your cells\n- gene network inference: generate\ + \ a gene network from any cell or cell\n cluster in your scRNAseq dataset\n" +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +info: + preferred_normalization: "counts" + variants: + scprint_large: + model_name: "large" + scprint_medium: + model_name: "medium" + scprint_small: + model_name: "small" + type: "method" + type_info: + label: "Method" + summary: "A method." + description: "A denoising method to remove noise (i.e. technical artifacts) from\ + \ a dataset.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +references: + doi: + - "10.1101/2024.07.29.605556" +links: + repository: "https://github.com/cantinilab/scPRINT" + docker_registry: "ghcr.io" + documentation: "https://cantinilab.github.io/scPRINT/" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "midmem" + - "midcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_pytorch_nvidia:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "huggingface_hub" + - "scprint" + upgrade: true + - type: "docker" + run: + - "lamin init --storage ./main --name main --schema bionty" + - type: "python" + user: false + script: + - "import bionty as bt; bt.core.sync_all_sources_to_latest()" + upgrade: true + - type: "docker" + run: + - "lamin load anonymous/main" + - type: "python" + user: false + script: + - "from scdataloader.utils import populate_my_ontology; populate_my_ontology()" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/methods/scprint/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: 
"target/nextflow/methods/scprint" + executable: "target/nextflow/methods/scprint/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. 
Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/methods/scprint/main.nf b/target/nextflow/methods/scprint/main.nf new file mode 100644 index 0000000..bed63c9 --- /dev/null +++ b/target/nextflow/methods/scprint/main.nf @@ -0,0 +1,3816 @@ +// scprint 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. 
+ * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? 
null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. 
Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. 
+ if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
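+  // (workflow.stubRun is true when the pipeline is launched with Nextflow's `-stub-run`
+  //  option; in that case the publishing and resource directives are dropped below so
+  //  that the stub processes can run with minimal requirements)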
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
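+        //   e.g. (illustrative): ["run1", "run1", [output: file("denoised.h5ad")]] -> ["run1", [output: file("denoised.h5ad")]]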
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scprint", + "namespace" : "methods", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--model_name", + "description" : "Which model to use. 
Not used if --model is provided.", + "default" : [ + "large" + ], + "required" : false, + "choices" : [ + "large", + "medium", + "small" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model", + "description" : "Path to the scPRINT model.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "label" : "scPRINT", + "summary" : "scPRINT is a large transformer model built for the inference of gene networks", + "description" : "scPRINT is a large transformer model built for the inference of gene networks\n(connections between genes explaining the cell's expression profile) from\nscRNAseq data.\n\nIt uses novel encoding and decoding of the cell expression profile and new\npre-training methodologies to learn a cell model.\n\nscPRINT can be used to perform the following analyses:\n\n- expression denoising: increase the resolution of your scRNAseq data\n- cell embedding: generate a low-dimensional representation of your dataset\n- label prediction: predict the cell type, disease, sequencer, sex, and\n ethnicity of your cells\n- gene network inference: generate a gene network from any cell or cell\n cluster in your scRNAseq dataset\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + } + ], + "info" : { + "preferred_normalization" : "counts", + "variants" : { + "scprint_large" : { + "model_name" : "large" + }, + "scprint_medium" : { + "model_name" : "medium" + }, + "scprint_small" : { + "model_name" : "small" + } + }, + "type" : "method", + "type_info" : { + "label" : "Method", + "summary" : "A method.", + "description" : "A denoising method to remove noise (i.e. 
technical artifacts) from a dataset.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "references" : { + "doi" : [ + "10.1101/2024.07.29.605556" + ] + }, + "links" : { + "repository" : "https://github.com/cantinilab/scPRINT", + "docker_registry" : "ghcr.io", + "documentation" : "https://cantinilab.github.io/scPRINT/" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "midmem", + "midcpu", + "gpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_pytorch_nvidia:1.0.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "pip" : [ + "huggingface_hub", + "scprint" + ], + "upgrade" : true + }, + { + "type" : "docker", + "run" : [ + "lamin init --storage ./main --name main --schema bionty" + ] + }, + { + "type" : "python", + "user" : false, + "script" : [ + "import bionty as bt; bt.core.sync_all_sources_to_latest()" + ], + "upgrade" : true + }, + { + "type" : "docker", + "run" : [ + "lamin load anonymous/main" + ] + }, + { + "type" : "python", + "user" : false, + "script" : [ + "from scdataloader.utils import populate_my_ontology; populate_my_ontology()" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/methods/scprint/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/methods/scprint", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). 
In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import os + +import anndata as ad +import scprint +import torch +from huggingface_hub import hf_hub_download +from scdataloader 
import Preprocessor +from scprint import scPrint +from scprint.tasks import Denoiser +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_train': $( if [ ! -z ${VIASH_PAR_INPUT_TRAIN+x} ]; then echo "r'${VIASH_PAR_INPUT_TRAIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model_name': $( if [ ! -z ${VIASH_PAR_MODEL_NAME+x} ]; then echo "r'${VIASH_PAR_MODEL_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! 
-z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print(f"====== scPRINT version {scprint.__version__} ======", flush=True) + +print("\\\\n>>> Reading input data...", flush=True) +input = ad.read_h5ad(par["input_train"]) +print(input) + +print("\\\\n>>> Preprocessing data...", flush=True) +adata = ad.AnnData( + X=input.layers["counts"] +) +adata.obs_names = input.obs_names +adata.var_names = input.var_names +if input.uns["dataset_organism"] == "homo_sapiens": + adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606" +elif input.uns["dataset_organism"] == "mus_musculus": + adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090" +else: + raise ValueError( + f"scPRINT requires human or mouse data, not '{input.uns['dataset_organism']}'" + ) + +preprocessor = Preprocessor( + # Lower this threshold for test datasets + min_valid_genes_id=1000 if input.n_vars < 2000 else 10000, + # Turn off cell filtering to return results for all cells + filter_cell_by_counts=False, + min_nnz_genes=False, + do_postp=False, + # Skip ontology checks + skip_validate=True, +) +adata = preprocessor(adata) +print(adata) + +model_checkpoint_file = par["model"] +if model_checkpoint_file is None: + print(f"\\\\n>>> Downloading '{par['model_name']}' model...", flush=True) + model_checkpoint_file = hf_hub_download( + repo_id="jkobject/scPRINT", filename=f"{par['model_name']}.ckpt" + ) +print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True) +model = scPrint.load_from_checkpoint( + model_checkpoint_file, + transformer="normal", # Don't use this for GPUs with flashattention + precpt_gene_emb=None, +) + +print("\\\\n>>> Denoising data...", flush=True) +if torch.cuda.is_available(): + print("CUDA is available, using GPU", flush=True) + precision = "16-mixed" + dtype = torch.float16 +else: + print("CUDA is not available, using CPU", flush=True) + precision = "32" + dtype = torch.float32 +n_cores_available = len(os.sched_getaffinity(0)) +print(f"Using {n_cores_available} worker cores") +denoiser = Denoiser( + num_workers=n_cores_available, + precision=precision, + max_cells=adata.n_obs, + doplot=False, + dtype=dtype, +) +_, idxs, genes, expr_pred = denoiser(model, adata) +print(f"Predicted expression dimensions: {expr_pred.shape}") + +print("\\\\n>>> Applying denoising...", flush=True) +adata.X = adata.X.tolil() +idxs = idxs if idxs is not None else range(adata.shape[0]) +for i, idx in enumerate(idxs): + adata.X[idx, adata.var.index.get_indexer(genes)] = expr_pred[i] +adata.X = adata.X.tocsr() +print(adata) + +print("\\\\n>>> Storing output...", flush=True) +output = ad.AnnData( + layers={ + "denoised": adata.X[:, adata.var.index.get_indexer(input.var_names)], + }, + obs=input.obs[[]], + var=input.var[[]], + uns={ + "dataset_id": input.uns["dataset_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\\\\n>>> Writing output AnnData to file...", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\\\\n>>> Done!", flush=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. 
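+ *
+ * For example (hypothetical identifiers and filenames), an input event such as
+ *   ["my_id", [input_train: file("train.h5ad"), model_name: "small"]]
+ * is expected to produce an output event along the lines of
+ *   ["my_id", [output: file("my_id.scprint.output.h5ad")]]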
+ */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." 
+ } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? 
args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def 
parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/methods/scprint", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "midmem", + "midcpu", + "gpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/methods/scprint/nextflow.config b/target/nextflow/methods/scprint/nextflow.config new file mode 100644 index 0000000..aacb9a2 --- /dev/null +++ b/target/nextflow/methods/scprint/nextflow.config @@ -0,0 +1,87 @@ +manifest { + name = 'methods/scprint' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' + description = 'scPRINT is a large transformer model built for the inference of gene networks\n(connections between genes explaining the cell\'s expression profile) from\nscRNAseq data.\n\nIt uses novel encoding and decoding of the cell expression profile and new\npre-training methodologies to learn a cell model.\n\nscPRINT can be used to perform the following analyses:\n\n- expression denoising: increase the resolution of your scRNAseq data\n- cell embedding: generate a low-dimensional representation of your dataset\n- label prediction: predict the cell type, disease, sequencer, sex, and\n ethnicity of your cells\n- gene network inference: generate a gene network from any cell or cell\n cluster in your scRNAseq dataset\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = 
false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/metrics/mse/.config.vsh.yaml b/target/nextflow/metrics/mse/.config.vsh.yaml new file mode 100644 index 0000000..b177ee4 --- /dev/null +++ b/target/nextflow/metrics/mse/.config.vsh.yaml @@ -0,0 +1,317 @@ +name: "mse" +namespace: "metrics" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_prediction" + label: "Denoised data" + summary: "A denoised dataset as output by a method." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Score" + summary: "File indicating the score of a metric." 
+ info: + format: + type: "h5ad" + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + - type: "string" + name: "metric_ids" + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: "double" + name: "metric_values" + description: "The metric values obtained for the given prediction. Must\ + \ be of same length as 'metric_ids'." + multiple: true + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + metrics: + - name: "mse" + label: "Mean-squared error" + summary: "The mean squared error between the denoised counts and the true counts." + description: "The mean squared error between the denoised counts of the training\ + \ dataset and the true counts of the test dataset after reweighing by the train/test\ + \ ratio" + references: + doi: "10.1101/786269" + v1: + path: "openproblems/tasks/denoising/metrics/mse.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + maximize: false + min: 0 + max: "+.inf" + type: "metric" + type_info: + label: "Metric" + summary: "A metric." 
+ description: "A metric for evaluating denoised datasets.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pypi: + - "scikit-learn" + - "scprep" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/metrics/mse/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/metrics/mse" + executable: "target/nextflow/metrics/mse/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. 
In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/metrics/mse/main.nf b/target/nextflow/metrics/mse/main.nf new file mode 100644 index 0000000..93a5921 --- /dev/null +++ b/target/nextflow/metrics/mse/main.nf @@ -0,0 +1,3787 @@ +// mse 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. 
Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? 
null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. 
Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. 
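+  // For example (hypothetical paths): a param_list file at '/data/params.yaml' that sets
+  // 'input_test: test.h5ad' is resolved to '/data/test.h5ad' by the block below.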
+ if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
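+          // illustrative example (hypothetical values):
+          //   ["join_1", "sample_a", [output: file("out.h5ad")]]  ->  ["sample_a", [output: file("out.h5ad")]]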
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "mse", + "namespace" : "metrics", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_test", + "label" : "Test data", + "summary" : "The subset of molecules used for the test dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + }, + { + "name" : "train_sum", + "type" : "integer", + "description" : "The total number of counts in the training dataset.", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_prediction", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Score", + "summary" : "File indicating the score of a metric.", + "info" : { + "format" : { + "type" : "h5ad", + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + 
}, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + }, + { + "type" : "string", + "name" : "metric_ids", + "description" : "One or more unique metric identifiers", + "multiple" : true, + "required" : true + }, + { + "type" : "double", + "name" : "metric_values", + "description" : "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'.", + "multiple" : true, + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" + }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "metrics" : [ + { + "name" : "mse", + "label" : "Mean-squared error", + "summary" : "The mean squared error between the denoised counts and the true counts.", + "description" : "The mean squared error between the denoised counts of the training dataset and the true counts of the test dataset after reweighing by the train/test ratio", + "references" : { + "doi" : "10.1101/786269" + }, + "v1" : { + "path" : "openproblems/tasks/denoising/metrics/mse.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "maximize" : false, + "min" : 0, + "max" : "+.inf" + } + ], + "type" : "metric", + "type_info" : { + "label" : "Metric", + "summary" : "A metric.", + "description" : "A metric for evaluating denoised datasets.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "highmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "pypi" : [ + "scikit-learn", + "scprep", + "numpy<2" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : 
"/home/runner/work/task_denoising/task_denoising/src/metrics/mse/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/metrics/mse", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. 
Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad +import scanpy as sc +import sklearn.metrics +import scprep + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_prediction': $( if [ ! -z ${VIASH_PAR_INPUT_PREDICTION+x} ]; then echo "r'${VIASH_PAR_INPUT_PREDICTION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! 
-z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load data", flush=True) +input_denoised = ad.read_h5ad(par['input_prediction']) +input_test = ad.read_h5ad(par['input_test']) + +test_data = ad.AnnData(X=input_test.layers["counts"]) +denoised_data = ad.AnnData(X=input_denoised.layers["denoised"]) + +print("Normalize data", flush=True) + +# scaling and transformation +target_sum = 10000 + +sc.pp.normalize_total(test_data, target_sum=target_sum) +sc.pp.log1p(test_data) + +sc.pp.normalize_total(denoised_data, target_sum=target_sum) +sc.pp.log1p(denoised_data) + +print("Compute mse value", flush=True) +error = sklearn.metrics.mean_squared_error( + scprep.utils.toarray(test_data.X), scprep.utils.toarray(denoised_data.X) +) + +print("Store mse value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] = input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. 
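+ * It stages file-type inputs as paths and bundles the remaining arguments into a
+ * single args map before handing each tuple to the generated process.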
+ * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? 
procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! 
+ def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! 
-z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/metrics/mse", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "highmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. 
+ // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/metrics/mse/nextflow.config b/target/nextflow/metrics/mse/nextflow.config new file mode 100644 index 0000000..64e566f --- /dev/null +++ b/target/nextflow/metrics/mse/nextflow.config @@ -0,0 +1,86 @@ +manifest { + name = 'metrics/mse' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir 
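+// the first non-empty environment variable below is used, falling back to /tmp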
+tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/metrics/poisson/.config.vsh.yaml b/target/nextflow/metrics/poisson/.config.vsh.yaml new file mode 100644 index 0000000..41c1f51 --- /dev/null +++ b/target/nextflow/metrics/poisson/.config.vsh.yaml @@ -0,0 +1,316 @@ +name: "poisson" +namespace: "metrics" +version: "1.0.0" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_prediction" + label: "Denoised data" + summary: "A denoised dataset as output by a method." 
+ info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "denoised" + description: "denoised data" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + label: "Score" + summary: "File indicating the score of a metric." + info: + format: + type: "h5ad" + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - type: "string" + name: "method_id" + description: "A unique identifier for the method" + required: true + - type: "string" + name: "metric_ids" + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: "double" + name: "metric_values" + description: "The metric values obtained for the given prediction. Must\ + \ be of same length as 'metric_ids'." + multiple: true + required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +test_resources: +- type: "python_script" + path: "check_config.py" + is_executable: true +- type: "python_script" + path: "run_and_check_output.py" + is_executable: true +- type: "file" + path: "library.bib" +- type: "file" + path: "resources_test/task_denoising/cxg_immune_cell_atlas" + dest: "resources_test/task_denoising/cxg_immune_cell_atlas" +info: + metrics: + - name: "poisson" + label: "Poisson Loss" + summary: "The Poisson log likelihood of the true counts observed in the distribution\ + \ of denoised counts" + description: "The Poisson log likelihood of observing the true counts of the test\ + \ dataset given the distribution given in the denoised dataset." + references: + doi: "10.1101/786269" + v1: + path: "openproblems/tasks/denoising/metrics/poisson.py" + commit: "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + maximize: false + min: 0 + max: "+.inf" + type: "metric" + type_info: + label: "Metric" + summary: "A metric." 
+ description: "A metric for evaluating denoised datasets.\n" +status: "enabled" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midtime" + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "openproblems/base_python:1.0.0" + namespace_separator: "/" + setup: + - type: "python" + user: false + pypi: + - "scprep" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +build_info: + config: "src/metrics/poisson/config.vsh.yaml" + runner: "nextflow" + engine: "docker" + output: "target/nextflow/metrics/poisson" + executable: "target/nextflow/metrics/poisson/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. 
In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/metrics/poisson/main.nf b/target/nextflow/metrics/poisson/main.nf new file mode 100644 index 0000000..714afdd --- /dev/null +++ b/target/nextflow/metrics/poisson/main.nf @@ -0,0 +1,3782 @@ +// poisson 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. 
Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? 
null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. 
Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. 
+ if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple whose first element
+ * is the ID of the event and whose second element holds a parameter map.
+ *
+ */
+def channelFromParams(Map params, Map config) {
+  def processedParams = _paramsToParamSets(params, config)
+  return Channel.fromList(processedParams)
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf'
+def checkUniqueIds(Map args) {
+  def stopOnError = args.stopOnError == null ? true : args.stopOnError
+
+  def idChecker = new IDChecker()
+
+  return filter { tup ->
+    if (!idChecker.observe(tup[0])) {
+      if (stopOnError) {
+        error "Duplicate id: ${tup[0]}"
+      } else {
+        log.warn "Duplicate id: ${tup[0]}, removing duplicate entry"
+        return false
+      }
+    }
+    return true
+  }
+}
+// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf'
+// This helper file will be deprecated soon
+preprocessInputsDeprecationWarningPrinted = false
+
+def preprocessInputsDeprecationWarning() {
+  if (!preprocessInputsDeprecationWarningPrinted) {
+    preprocessInputsDeprecationWarningPrinted = true
+    System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.")
+  }
+}
+
+/**
+ * Generate a nextflow Workflow that allows processing a channel of
+ * Vdsl3 formatted events and applying a Viash config to them:
+ *   - Gather default parameters from the Viash config and make
+ *     sure that they are correctly formatted (see applyConfig method).
+ *   - Format the input parameters (also using the applyConfig method).
+ *   - Apply the default parameters to the input parameters.
+ *   - Do some assertions:
+ *     ~ Check if the event IDs in the channel are unique.
+ *
+ * The events in the channel are formatted as tuples, with the
+ * first element of the tuples being a unique id of the parameter set,
+ * and the second element containing the parameters themselves.
+ * Optional extra elements of the tuples will be passed to the output as is.
+ *
+ * @param args A map that must contain a 'config' key that points
+ *   to a parsed config (see readConfig()). Optionally, a
+ *   'key' key can be provided which can be used to create a unique
+ *   name for the workflow process.
+ *
+ * @return A workflow that allows processing a channel of Vdsl3 formatted events
+ *   and applying a Viash config to them.
+ */
+def preprocessInputs(Map args) {
+  preprocessInputsDeprecationWarning()
+
+  def config = args.config
+  assert config instanceof Map :
+    "Error in preprocessInputs: config must be a map. " +
+    "Expected class: Map. Found: config.getClass() is ${config.getClass()}"
+  def key_ = args.key ?: config.name
+
+  // Get different parameter types (used throughout this function)
+  def defaultArgs = config.allArguments
+    .findAll { it.containsKey("default") }
+    .collectEntries { [ it.plainName, it.default ] }
+
+  map { tup ->
+    def id = tup[0]
+    def data = tup[1]
+    def passthrough = tup.drop(2)
+
+    def new_data = (defaultArgs + data).collectEntries { name, value ->
+      def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") }
+
+      if (par != null) {
+        value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'")
+      }
+
+      [ name, value ]
+    }
+
+    [ id, new_data ] + passthrough
+  }
+}
+
+// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf'
+/**
+ * Run a list of components on a stream of data.
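+ *
+ * Illustrative usage (a sketch with hypothetical module names, not generated code; `method1` and
+ * `method2` are assumed to be built Viash VDSL3 modules and `input_ch` a channel of [id, state]
+ * tuples; the same argument shape applies to the non-deprecated runEach()):
+ *
+ *   input_ch | runComponents(
+ *     components: [method1, method2],
+ *     fromState: [input: "dataset"],
+ *     toState: [output: "output"]
+ *   )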
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
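+ * Illustrative example (hypothetical events, not generated code):
+ *   target event: ["id1", [output: file("out.h5ad")]]
+ *   source event: ["id1", [input: file("in.h5ad")]]
+ * are joined into
+ *   ["id1", [output: file("out.h5ad")], [input: file("in.h5ad")]]
+ * i.e. the target tuple followed by the trailing elements of the matching source tuple.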
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
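+  // Note (assumption about the resulting YAML): Path and File values are serialized with the custom
+  // `!file` tag, e.g. `input: !file data/input.h5ad`, relativized against `relativizer` when given,
+  // so that readTaggedYaml() can restore them as Path objects.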
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
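+        // drop the leading join_id: downstream consumers only see [id, new_state, ...]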
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "poisson", + "namespace" : "metrics", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_test", + "label" : "Test data", + "summary" : "The subset of molecules used for the test dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + }, + { + "name" : "train_sum", + "type" : "integer", + "description" : "The total number of counts in the training dataset.", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_prediction", + "label" : "Denoised data", + "summary" : "A denoised dataset as output by a method.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "denoised", + "description" : "denoised data", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/denoised.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "label" : "Score", + "summary" : "File indicating the score of a metric.", + "info" : { + "format" : { + "type" : "h5ad", + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true 
+ }, + { + "type" : "string", + "name" : "method_id", + "description" : "A unique identifier for the method", + "required" : true + }, + { + "type" : "string", + "name" : "metric_ids", + "description" : "One or more unique metric identifiers", + "multiple" : true, + "required" : true + }, + { + "type" : "double", + "name" : "metric_values", + "description" : "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'.", + "multiple" : true, + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/score.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + } + ], + "test_resources" : [ + { + "type" : "python_script", + "path" : "/common/component_tests/check_config.py", + "is_executable" : true + }, + { + "type" : "python_script", + "path" : "/common/component_tests/run_and_check_output.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/common/library.bib" + }, + { + "type" : "file", + "path" : "/resources_test/task_denoising/cxg_immune_cell_atlas", + "dest" : "resources_test/task_denoising/cxg_immune_cell_atlas" + } + ], + "info" : { + "metrics" : [ + { + "name" : "poisson", + "label" : "Poisson Loss", + "summary" : "The Poisson log likelihood of the true counts observed in the distribution of denoised counts", + "description" : "The Poisson log likelihood of observing the true counts of the test dataset given the distribution given in the denoised dataset.", + "references" : { + "doi" : "10.1101/786269" + }, + "v1" : { + "path" : "openproblems/tasks/denoising/metrics/poisson.py", + "commit" : "b3456fd73c04c28516f6df34c57e6e3e8b0dab32" + }, + "maximize" : false, + "min" : 0, + "max" : "+.inf" + } + ], + "type" : "metric", + "type_info" : { + "label" : "Metric", + "summary" : "A metric.", + "description" : "A metric for evaluating denoised datasets.\n" + } + }, + "status" : "enabled", + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midtime", + "highmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "openproblems/base_python:1.0.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "pypi" : [ + "scprep", + "numpy<2" + ], + "upgrade" : true + } + ] + } + ], + "build_info" : { + "config" : 
"/home/runner/work/task_denoising/task_denoising/src/metrics/poisson/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker", + "output" : "target/nextflow/metrics/poisson", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. 
Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +import anndata as ad +import scprep +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_test': $( if [ ! -z ${VIASH_PAR_INPUT_TEST+x} ]; then echo "r'${VIASH_PAR_INPUT_TEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_prediction': $( if [ ! -z ${VIASH_PAR_INPUT_PREDICTION+x} ]; then echo "r'${VIASH_PAR_INPUT_PREDICTION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! 
-z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Load Data", flush=True) +input_denoised = ad.read_h5ad(par['input_prediction']) +input_test = ad.read_h5ad(par['input_test']) + +test_data = scprep.utils.toarray(input_test.layers["counts"]) +denoised_data = scprep.utils.toarray(input_denoised.layers["denoised"]) + +print("Compute metric value", flush=True) +# scaling +initial_sum = input_test.uns["train_sum"] +target_sum = test_data.sum() +denoised_data = denoised_data * target_sum / initial_sum + +# from molecular_cross_validation.mcv_sweep import poisson_nll_loss +# copied from: https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/mcv_sweep.py +def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float: + return (y_pred - y_true * np.log(y_pred + 1e-6)).mean() + +error = poisson_nll_loss(test_data, denoised_data) + +print("Store poisson value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] = input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. 
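+ *
+ * For this component, a channel item could look like (hypothetical id and paths):
+ *   input:  ["run_1", [input_test: file("test.h5ad"), input_prediction: file("denoised.h5ad")]]
+ *   output: ["run_1", [output: file("score.h5ad")]]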
+ * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? 
procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! 
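+  // (the host temp dir resolved below is only used for non-containerised runs; for docker,
+  //  podman and charliecloud the generated process script sets VIASH_META_TEMP_DIR to /tmp)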
+ def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! 
-z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = new nextflow.script.ScriptParser(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "ghcr.io", + "image" : "openproblems-bio/task_denoising/metrics/poisson", + "tag" : "1.0.0" + }, + "label" : [ + "midtime", + "highmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. 
+ // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/metrics/poisson/nextflow.config b/target/nextflow/metrics/poisson/nextflow.config new file mode 100644 index 0000000..3723190 --- /dev/null +++ b/target/nextflow/metrics/poisson/nextflow.config @@ -0,0 +1,86 @@ +manifest { + name = 'metrics/poisson' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' +} + +process.container = 'nextflow/bash:latest' + +// detect 
tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/target/nextflow/workflows/process_datasets/.config.vsh.yaml b/target/nextflow/workflows/process_datasets/.config.vsh.yaml new file mode 100644 index 0000000..712e14b --- /dev/null +++ b/target/nextflow/workflows/process_datasets/.config.vsh.yaml @@ -0,0 +1,300 @@ +name: "process_datasets" +namespace: "workflows" +version: "1.0.0" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + label: "Common Dataset" + summary: "A subset of the common dataset." + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + obs: + - type: "string" + name: "batch" + description: "Batch information" + required: false + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." 
+ required: false + example: + - "dataset.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." 
+ required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "helper.nf" +info: null +status: "enabled" +dependencies: +- name: "schema/verify_data_structure" + repository: + type: "github" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +- name: "data_processors/process_dataset" + repository: + type: "local" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +build_info: + config: "src/workflows/process_datasets/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/process_datasets" + executable: "target/nextflow/workflows/process_datasets/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" + dependencies: + - "target/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure" + - "target/nextflow/data_processors/process_dataset" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. 
As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/workflows/process_datasets/helper.nf b/target/nextflow/workflows/process_datasets/helper.nf new file mode 100644 index 0000000..e05fc9d --- /dev/null +++ b/target/nextflow/workflows/process_datasets/helper.nf @@ -0,0 +1,31 @@ +Map findArgumentSchema(Map config, String argument_id) { + def argument_groups = + (config.argument_groups ?: []) + + [ + arguments: config.arguments ?: [] + ] + + def schema_value = argument_groups.findResult{ gr -> + gr.arguments.find { arg -> + arg.name == ("--" + argument_id) + } + } + return schema_value +} + +Boolean checkItemAllowed(String item, List include, List exclude, String includeArgName, String excludeArgName) { + + // Throw an error if both include and exclude lists are provided + if (include != null && exclude != null) { + throw new Exception("Cannot define both ${includeArgName} and ${excludeArgName}") + } + + if (include) { + return include.contains(item) + } + if (exclude) { + return !exclude.contains(item) + } + + return true +} diff --git a/target/nextflow/workflows/process_datasets/main.nf b/target/nextflow/workflows/process_datasets/main.nf new file mode 100644 index 0000000..af420ef --- /dev/null +++ 
b/target/nextflow/workflows/process_datasets/main.nf @@ -0,0 +1,3403 @@ +// process_datasets 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? 
null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = 
_checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." 
+ } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. 
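+ *
+ * For example (the ids and argument names here are purely illustrative), a param_list
+ * with two entries would produce two events along the lines of
+ * ["sample_one", [input: file("sample_one.h5ad")]] and
+ * ["sample_two", [input: file("sample_two.h5ad")]].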
+ * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? 
+ params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? 
args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. 
+ * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. 
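+ *
+ * A minimal usage sketch; `some_method` and the state keys below are illustrative
+ * placeholders, not definitions made in this file:
+ *
+ *   input_ch
+ *     | runEach(
+ *         components: [ some_method ],
+ *         fromState: [ input_train: "train" ],
+ *         toState: [ prediction: "output" ]
+ *       )
+ *
+ * Each event keeps its id, the module receives `input_train` taken from the state key
+ * `train`, and the module output `output` is stored back into the state as `prediction`.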
+ * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. 
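+ *
+ * Typical call (channel and key names are illustrative):
+ *
+ *   safeJoin(methodOutputCh, inputCh, "some_module")
+ *
+ * which joins each event of `methodOutputCh` back onto the matching event of `inputCh`
+ * and appends any passthrough elements of the original input tuple.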
+ */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. 
+ | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? 
+ "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. 
+ * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. 
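+ *
+ * Example (illustrative values): iterateMap([a: [1, 2], b: 3], { it * 10 })
+ * returns [a: [10, 20], b: 30], preserving the nested structure.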
+ */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
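+  // In a stub run (`nextflow run ... -stub`), processes only create placeholder outputs, so publishing and resource directives are removed below; presumably this keeps the dry run lightweight.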
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
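+        // e.g. ["orig_id", "new_id", [output: ...]] becomes ["new_id", [output: ...]]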
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "process_datasets", + "namespace" : "workflows", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "label" : "Common Dataset", + "summary" : "A subset of the common dataset.", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "obs" : [ + { + "type" : "string", + "name" : "batch", + "description" : "Batch information", + "required" : false + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "dataset.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_test", + "label" : "Test data", + "summary" : "The subset of molecules used for the test dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" 
: [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + }, + { + "name" : "train_sum", + "type" : "integer", + "description" : "The total number of counts in the training dataset.", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/common/nextflow_helpers/helper.nf" + } + ], + "status" : "enabled", + "dependencies" : [ + { + "name" : "schema/verify_data_structure", + "repository" : { + "type" : "github", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + }, + { + "name" : "data_processors/process_dataset", + "repository" : { + "type" : "local" + } + } + ], + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/workflows/process_datasets/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "target/nextflow/workflows/process_datasets", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is 
the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. 
Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { verify_data_structure } from "${meta.root_dir}/dependencies/github/openproblems-bio/core/build/main/nextflow/schema/verify_data_structure/main.nf" +include { process_dataset } from "${meta.resources_dir}/../../../nextflow/data_processors/process_dataset/main.nf" + +// inner workflow +// user-provided Nextflow code +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | verify_data_structure.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? 
state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
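+  //
+  // For instance, `toState: ["output"]` (List form) adds the module's `output` to the existing state under the same key, while `toState: [counts: "output"]` (Map form) stores it under `counts` instead.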
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/process_datasets/nextflow.config b/target/nextflow/workflows/process_datasets/nextflow.config new file mode 100644 index 0000000..454707d --- /dev/null +++ b/target/nextflow/workflows/process_datasets/nextflow.config @@ -0,0 +1,86 @@ +manifest { + name = 'workflows/process_datasets' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git 
a/target/nextflow/workflows/run_benchmark/.config.vsh.yaml b/target/nextflow/workflows/run_benchmark/.config.vsh.yaml new file mode 100644 index 0000000..8ce0407 --- /dev/null +++ b/target/nextflow/workflows/run_benchmark/.config.vsh.yaml @@ -0,0 +1,361 @@ +name: "run_benchmark" +namespace: "workflows" +version: "1.0.0" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input_train" + label: "Training data" + summary: "The subset of molecules used for the training dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_test" + label: "Test data" + summary: "The subset of molecules used for the test dataset" + info: + format: + type: "h5ad" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + uns: + - type: "string" + name: "dataset_id" + description: "A unique identifier for the dataset" + required: true + - name: "dataset_name" + type: "string" + description: "Nicely formatted name." + required: true + - type: "string" + name: "dataset_url" + description: "Link to the original source of the dataset." + required: false + - name: "dataset_reference" + type: "string" + description: "Bibtex reference of the paper in which the dataset was published." + required: false + - name: "dataset_summary" + type: "string" + description: "Short description of the dataset." + required: true + - name: "dataset_description" + type: "string" + description: "Long description of the dataset." + required: true + - name: "dataset_organism" + type: "string" + description: "The organism of the sample in the dataset." + required: false + - name: "train_sum" + type: "integer" + description: "The total number of counts in the training dataset." 
+ required: true + example: + - "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output_scores" + description: "A yaml file containing the scores of each of the methods" + info: null + default: + - "score_uns.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_method_configs" + info: null + default: + - "method_configs.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_metric_configs" + info: null + default: + - "metric_configs.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_dataset_info" + info: null + default: + - "dataset_uns.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_task_info" + info: null + default: + - "task_info.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Method filtering" + description: "Use these arguments to filter methods by name. By default, all methods\ + \ are\nrun. If `--methods_include` is defined, only those methods are run. If\n\ + `--methods_exclude` is defined, all methods except those specified are run.\n\ + These arguments are mutually exclusive, so only `--methods_include` OR\n`--methods_exclude`\ + \ can set but not both.\n" + arguments: + - type: "string" + name: "--methods_include" + description: "A list of method ids to include. If specified, only these methods\ + \ will be run.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--methods_exclude" + description: "A list of method ids to exclude. 
If specified, all methods except\ + \ the ones listed will be run.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "_viash.yaml" +- type: "file" + path: "helper.nf" +info: null +status: "enabled" +dependencies: +- name: "h5ad/extract_uns_metadata" + repository: + type: "github" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +- name: "control_methods/no_denoising" + repository: + type: "local" +- name: "control_methods/perfect_denoising" + repository: + type: "local" +- name: "methods/alra" + repository: + type: "local" +- name: "methods/dca" + repository: + type: "local" +- name: "methods/knn_smoothing" + repository: + type: "local" +- name: "methods/magic" + repository: + type: "local" +- name: "methods/scprint" + repository: + type: "local" +- name: "metrics/mse" + repository: + type: "local" +- name: "metrics/poisson" + repository: + type: "local" +repositories: +- type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" +license: "MIT" +links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + lowmem: "memory = 20.Gb" + midmem: "memory = 50.Gb" + highmem: "memory = 100.Gb" + lowcpu: "cpus = 5" + midcpu: "cpus = 15" + highcpu: "cpus = 30" + lowtime: "time = 1.h" + midtime: "time = 4.h" + hightime: "time = 8.h" + veryhightime: "time = 24.h" + debug: false + container: "docker" +build_info: + config: "src/workflows/run_benchmark/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/run_benchmark" + executable: "target/nextflow/workflows/run_benchmark/main.nf" + viash_version: "0.9.0" + git_commit: "252731bc7276eb8a6a3398dc4bea026ae70eca80" + git_remote: "https://github.com/openproblems-bio/task_denoising" + dependencies: + - "target/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata" + - "target/nextflow/control_methods/no_denoising" + - "target/nextflow/control_methods/perfect_denoising" + - "target/nextflow/methods/alra" + - "target/nextflow/methods/dca" + - "target/nextflow/methods/knn_smoothing" + - "target/nextflow/methods/magic" + - "target/nextflow/methods/scprint" + - "target/nextflow/metrics/mse" + - "target/nextflow/metrics/poisson" +package_config: + name: "task_denoising" + version: "1.0.0" + label: "Denoising" + summary: "Removing noise in sparse single-cell RNA-sequencing count data" + description: "A key challenge in evaluating denoising methods is the general lack\ + \ of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\n\ + relied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)),\ + \ and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers\ + \ from specific limitations, it is\ndifficult to combine these different approaches\ + \ into a single quantitative measure of\ndenoising accuracy. 
Here, we instead\ + \ rely on an approach termed molecular\ncross-validation (MCV), which was specifically\ + \ developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson\ + \ et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the\ + \ observed molecules\nin a given scRNA-Seq dataset are first partitioned between\ + \ a *training* and a *test*\ndataset. Next, a denoising method is applied to the\ + \ training dataset. Finally, denoising\naccuracy is measured by comparing the\ + \ result to the test dataset. The authors show that\nboth in theory and in practice,\ + \ the measured denoising accuracy is representative of the\naccuracy that would\ + \ be obtained on a ground truth dataset.\n" + info: + image: "thumbnail.svg" + motivation: "Single-cell RNA-Seq protocols only detect a fraction of the mRNA\ + \ molecules present\nin each cell. As a result, the measurements (UMI counts)\ + \ observed for each gene and each\ncell are associated with generally high levels\ + \ of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)).\ + \ Denoising describes the task of\nestimating the true expression level of each\ + \ gene in each cell. In the single-cell\nliterature, this task is also referred\ + \ to as *imputation*, a term which is typically\nused for missing data problems\ + \ in statistics. Similar to the use of the terms \"dropout\",\n\"missing data\"\ + , and \"technical zeros\", this terminology can create confusion about the\n\ + underlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n" + test_resources: + - type: "s3" + path: "s3://openproblems-data/resources_test/task_denoising/" + dest: "resources_test/task_denoising" + - type: "s3" + path: "s3://openproblems-data/resources_test/common/" + dest: "resources_test/common" + repositories: + - type: "github" + name: "core" + repo: "openproblems-bio/core" + tag: "build/main" + path: "viash/core" + viash_version: "0.9.0" + source: "src" + target: "target" + config_mods: + - ".runners[.type == \"nextflow\"].config.labels := { lowmem : \"memory = 20.Gb\"\ + , midmem : \"memory = 50.Gb\", highmem : \"memory = 100.Gb\", lowcpu : \"cpus\ + \ = 5\", midcpu : \"cpus = 15\", highcpu : \"cpus = 30\", lowtime : \"time = 1.h\"\ + , midtime : \"time = 4.h\", hightime : \"time = 8.h\", veryhightime : \"time =\ + \ 24.h\" }" + authors: + - name: "Wesley Lewis" + roles: + - "author" + - "maintainer" + info: + github: "wes-lewis" + - name: "Scott Gigante" + roles: + - "author" + - "maintainer" + info: + github: "scottgigante" + orcid: "0000-0002-4544-2764" + - name: "Robrecht Cannoodt" + roles: + - "author" + info: + github: "rcannood" + orcid: "0000-0003-3641-729X" + - name: "Kai Waldrant" + roles: + - "contributor" + info: + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + keywords: + - "single-cell" + - "openproblems" + - "benchmark" + - "denoising" + license: "MIT" + organization: "openproblems-bio" + links: + repository: "https://github.com/openproblems-bio/task_denoising" + docker_registry: "ghcr.io" + issue_tracker: "https://github.com/openproblems-bio/task_denoising/issues" diff --git a/target/nextflow/workflows/run_benchmark/_viash.yaml b/target/nextflow/workflows/run_benchmark/_viash.yaml new file mode 100644 index 0000000..9466d58 --- /dev/null +++ b/target/nextflow/workflows/run_benchmark/_viash.yaml @@ -0,0 +1,77 @@ +name: task_denoising +organization: openproblems-bio +version: 1.0.0 +license: 
MIT +label: Denoising +keywords: [single-cell, openproblems, benchmark, denoising] +summary: "Removing noise in sparse single-cell RNA-sequencing count data" +description: | + A key challenge in evaluating denoising methods is the general lack of a ground truth. A + recent benchmark study ([Hou et al., + 2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) + relied on flow-sorted datasets, mixture control experiments ([Tian et al., + 2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk + RNA-Seq data. Since each of these approaches suffers from specific limitations, it is + difficult to combine these different approaches into a single quantitative measure of + denoising accuracy. Here, we instead rely on an approach termed molecular + cross-validation (MCV), which was specifically developed to quantify denoising accuracy + in the absence of a ground truth ([Batson et al., + 2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules + in a given scRNA-Seq dataset are first partitioned between a *training* and a *test* + dataset. Next, a denoising method is applied to the training dataset. Finally, denoising + accuracy is measured by comparing the result to the test dataset. The authors show that + both in theory and in practice, the measured denoising accuracy is representative of the + accuracy that would be obtained on a ground truth dataset. +links: + issue_tracker: https://github.com/openproblems-bio/task_denoising/issues + repository: https://github.com/openproblems-bio/task_denoising + docker_registry: ghcr.io +info: + image: thumbnail.svg + motivation: | + Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present + in each cell. As a result, the measurements (UMI counts) observed for each gene and each + cell are associated with generally high levels of technical noise ([Grün et al., + 2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of + estimating the true expression level of each gene in each cell. In the single-cell + literature, this task is also referred to as *imputation*, a term which is typically + used for missing data problems in statistics. Similar to the use of the terms "dropout", + "missing data", and "technical zeros", this terminology can create confusion about the + underlying measurement process ([Sarkar and Stephens, + 2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)). 
+ test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/task_denoising/ + dest: resources_test/task_denoising + - type: s3 + path: s3://openproblems-data/resources_test/common/ + dest: resources_test/common +authors: + - name: "Wesley Lewis" + roles: [author, maintainer] + info: + github: wes-lewis + - name: "Scott Gigante" + roles: [author, maintainer] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: Robrecht Cannoodt + roles: [author] + info: + github: rcannood + orcid: "0000-0003-3641-729X" + - name: Kai Waldrant + roles: [contributor] + info: + github: KaiWaldrant + orcid: "0009-0003-8555-1361" +repositories: + - name: core + type: github + repo: openproblems-bio/core + tag: build/main + path: viash/core +viash_version: 0.9.0 +config_mods: | + .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } diff --git a/target/nextflow/workflows/run_benchmark/helper.nf b/target/nextflow/workflows/run_benchmark/helper.nf new file mode 100644 index 0000000..e05fc9d --- /dev/null +++ b/target/nextflow/workflows/run_benchmark/helper.nf @@ -0,0 +1,31 @@ +Map findArgumentSchema(Map config, String argument_id) { + def argument_groups = + (config.argument_groups ?: []) + + [ + arguments: config.arguments ?: [] + ] + + def schema_value = argument_groups.findResult{ gr -> + gr.arguments.find { arg -> + arg.name == ("--" + argument_id) + } + } + return schema_value +} + +Boolean checkItemAllowed(String item, List include, List exclude, String includeArgName, String excludeArgName) { + + // Throw an error if both include and exclude lists are provided + if (include != null && exclude != null) { + throw new Exception("Cannot define both ${includeArgName} and ${excludeArgName}") + } + + if (include) { + return include.contains(item) + } + if (exclude) { + return !exclude.contains(item) + } + + return true +} diff --git a/target/nextflow/workflows/run_benchmark/main.nf b/target/nextflow/workflows/run_benchmark/main.nf new file mode 100644 index 0000000..4a80e82 --- /dev/null +++ b/target/nextflow/workflows/run_benchmark/main.nf @@ -0,0 +1,3612 @@ +// run_benchmark 1.0.0 +// +// This wrapper script is auto-generated by viash 0.9.0 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. 
Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value instanceof String) { + try { + value = value.toInteger() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigInteger) { + value = value.intValue() + } + expectedClass = value instanceof Integer ? null : "Integer" + } else if (par.type == "long") { + // cast to long if need be + if (value instanceof String) { + try { + value = value.toLong() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof Integer) { + value = value.toLong() + } + expectedClass = value instanceof Long ? null : "Long" + } else if (par.type == "double") { + // cast to double if need be + if (value instanceof String) { + try { + value = value.toDouble() + } catch (NumberFormatException e) { + // do nothing + } + } + if (value instanceof java.math.BigDecimal) { + value = value.doubleValue() + } + if (value instanceof Float) { + value = value.toDouble() + } + expectedClass = value instanceof Double ? null : "Double" + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value instanceof String) { + def valueLower = value.toLowerCase() + if (valueLower == "true") { + value = true + } else if (valueLower == "false") { + value = false + } + } + expectedClass = value instanceof Boolean ? 
null : "Boolean" + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value instanceof GString) { + value = value.toString() + } + expectedClass = value instanceof String ? null : "String" + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required) { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _processOutputValues(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. 
Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. 
+ if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. 
Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. 
Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. 
+ * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. 
+ * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. 
+ * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. 
Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? 
"\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 
'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resolved, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? 
params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number of fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? 
file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + 
options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." 
+ key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename, inputFiles_, outputFilenames_] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{[yamlFile] + outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ +mkdir -p "\$(dirname '${yamlFile}')" +echo "Storing state as yaml" +echo '${yamlBlob}' > '${yamlFile}' +echo "Copying output files to destination folder" +${copyCommands.join("\n ")} +""" +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? 
params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (key, value) are the tuples that will be saved to the state.yaml file + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value, inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = val instanceof File ? val.toPath() : val + [value: value_, inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["value", "inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? 
value.toPath() : value + return [[key: plainName_, value: value_, inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename, inputPaths, outputFilenames] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? 
mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$requiredKey' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? 
":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? 
+ // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. 
Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. 
Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? 
+ if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. 
Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. 
Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? 
+ chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutput = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + // check output tuple + | map { id_, output_ -> + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _processOutputValues(output_, meta.config, id_, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && output_.size() == 1) { + output_ = output_.values()[0] + } + + [join_id, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chInitialOutput, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublish = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublish, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + + // remove join_id and meta + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] 
+ tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "run_benchmark", + "namespace" : "workflows", + "version" : "1.0.0", + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input_train", + "label" : "Training data", + "summary" : "The subset of molecules used for the training dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/train.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_test", + "label" : "Test data", + "summary" : "The subset of molecules used for the test dataset", + "info" : { + "format" : { + "type" : "h5ad", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "uns" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "A unique identifier for the dataset", + "required" : true + }, + { + "name" : "dataset_name", + "type" : "string", + "description" : "Nicely formatted name.", + "required" : true + }, + { + "type" : "string", + "name" : "dataset_url", + "description" : "Link to the original source of the dataset.", + "required" : false + }, + { + "name" : "dataset_reference", + "type" : "string", + "description" : "Bibtex reference of the paper in which the dataset was published.", + "required" : false + }, + { + "name" : "dataset_summary", + "type" : "string", + "description" : "Short description of the dataset.", + "required" : true + }, + { + "name" : "dataset_description", + "type" : "string", + "description" : "Long description of the dataset.", + "required" : true + }, + { + "name" : "dataset_organism", + "type" : "string", + "description" : "The organism of the sample in the dataset.", + "required" : false + }, + { + "name" : "train_sum", + "type" : "integer", + "description" : "The total number of counts in the training dataset.", + "required" : true + } + ] + } + }, + "example" : [ + "resources_test/task_denoising/cxg_immune_cell_atlas/test.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output_scores", + "description" : "A yaml file containing the scores of each of the methods", + "default" : [ + "score_uns.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : 
true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_method_configs", + "default" : [ + "method_configs.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_metric_configs", + "default" : [ + "metric_configs.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_dataset_info", + "default" : [ + "dataset_uns.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_task_info", + "default" : [ + "task_info.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Method filtering", + "description" : "Use these arguments to filter methods by name. By default, all methods are\nrun. If `--methods_include` is defined, only those methods are run. If\n`--methods_exclude` is defined, all methods except those specified are run.\nThese arguments are mutually exclusive, so only `--methods_include` OR\n`--methods_exclude` can set but not both.\n", + "arguments" : [ + { + "type" : "string", + "name" : "--methods_include", + "description" : "A list of method ids to include. If specified, only these methods will be run.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--methods_exclude", + "description" : "A list of method ids to exclude. 
If specified, all methods except the ones listed will be run.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/_viash.yaml" + }, + { + "type" : "file", + "path" : "/common/nextflow_helpers/helper.nf" + } + ], + "status" : "enabled", + "dependencies" : [ + { + "name" : "h5ad/extract_uns_metadata", + "repository" : { + "type" : "github", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + }, + { + "name" : "control_methods/no_denoising", + "repository" : { + "type" : "local" + } + }, + { + "name" : "control_methods/perfect_denoising", + "repository" : { + "type" : "local" + } + }, + { + "name" : "methods/alra", + "repository" : { + "type" : "local" + } + }, + { + "name" : "methods/dca", + "repository" : { + "type" : "local" + } + }, + { + "name" : "methods/knn_smoothing", + "repository" : { + "type" : "local" + } + }, + { + "name" : "methods/magic", + "repository" : { + "type" : "local" + } + }, + { + "name" : "methods/scprint", + "repository" : { + "type" : "local" + } + }, + { + "name" : "metrics/mse", + "repository" : { + "type" : "local" + } + }, + { + "name" : "metrics/poisson", + "repository" : { + "type" : "local" + } + } + ], + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "lowmem" : "memory = 20.Gb", + "midmem" : "memory = 50.Gb", + "highmem" : "memory = 100.Gb", + "lowcpu" : "cpus = 5", + "midcpu" : "cpus = 15", + "highcpu" : "cpus = 30", + "lowtime" : "time = 1.h", + "midtime" : "time = 4.h", + "hightime" : "time = 8.h", + "veryhightime" : "time = 24.h" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/home/runner/work/task_denoising/task_denoising/src/workflows/run_benchmark/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "target/nextflow/workflows/run_benchmark", + "viash_version" : "0.9.0", + "git_commit" : "252731bc7276eb8a6a3398dc4bea026ae70eca80", + "git_remote" : "https://github.com/openproblems-bio/task_denoising" + }, + "package_config" : { + "name" : "task_denoising", + "version" : "1.0.0", + "label" : "Denoising", + "summary" : "Removing noise in sparse single-cell RNA-sequencing count data", + "description" : "A key challenge in evaluating denoising methods is the general lack of a ground truth. A\nrecent benchmark study ([Hou et al.,\n2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))\nrelied on flow-sorted datasets, mixture control experiments ([Tian et al.,\n2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk\nRNA-Seq data. Since each of these approaches suffers from specific limitations, it is\ndifficult to combine these different approaches into a single quantitative measure of\ndenoising accuracy. 
Here, we instead rely on an approach termed molecular\ncross-validation (MCV), which was specifically developed to quantify denoising accuracy\nin the absence of a ground truth ([Batson et al.,\n2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules\nin a given scRNA-Seq dataset are first partitioned between a *training* and a *test*\ndataset. Next, a denoising method is applied to the training dataset. Finally, denoising\naccuracy is measured by comparing the result to the test dataset. The authors show that\nboth in theory and in practice, the measured denoising accuracy is representative of the\naccuracy that would be obtained on a ground truth dataset.\n", + "info" : { + "image" : "thumbnail.svg", + "motivation" : "Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present\nin each cell. As a result, the measurements (UMI counts) observed for each gene and each\ncell are associated with generally high levels of technical noise ([Grün et al.,\n2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of\nestimating the true expression level of each gene in each cell. In the single-cell\nliterature, this task is also referred to as *imputation*, a term which is typically\nused for missing data problems in statistics. Similar to the use of the terms \\"dropout\\",\n\\"missing data\\", and \\"technical zeros\\", this terminology can create confusion about the\nunderlying measurement process ([Sarkar and Stephens,\n2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).\n", + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/task_denoising/", + "dest" : "resources_test/task_denoising" + }, + { + "type" : "s3", + "path" : "s3://openproblems-data/resources_test/common/", + "dest" : "resources_test/common" + } + ] + }, + "repositories" : [ + { + "type" : "github", + "name" : "core", + "repo" : "openproblems-bio/core", + "tag" : "build/main", + "path" : "viash/core" + } + ], + "viash_version" : "0.9.0", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".runners[.type == \\"nextflow\\"].config.labels := { lowmem : \\"memory = 20.Gb\\", midmem : \\"memory = 50.Gb\\", highmem : \\"memory = 100.Gb\\", lowcpu : \\"cpus = 5\\", midcpu : \\"cpus = 15\\", highcpu : \\"cpus = 30\\", lowtime : \\"time = 1.h\\", midtime : \\"time = 4.h\\", hightime : \\"time = 8.h\\", veryhightime : \\"time = 24.h\\" }" + ], + "authors" : [ + { + "name" : "Wesley Lewis", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "wes-lewis" + } + }, + { + "name" : "Scott Gigante", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "github" : "scottgigante", + "orcid" : "0000-0002-4544-2764" + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X" + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361" + } + } + ], + "keywords" : [ + "single-cell", + "openproblems", + "benchmark", + "denoising" + ], + "license" : "MIT", + "organization" : "openproblems-bio", + "links" : { + "repository" : "https://github.com/openproblems-bio/task_denoising", + "docker_registry" : "ghcr.io", + "issue_tracker" : "https://github.com/openproblems-bio/task_denoising/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { 
extract_uns_metadata } from "${meta.root_dir}/dependencies/github/openproblems-bio/core/build/main/nextflow/h5ad/extract_uns_metadata/main.nf" +include { no_denoising } from "${meta.resources_dir}/../../../nextflow/control_methods/no_denoising/main.nf" +include { perfect_denoising } from "${meta.resources_dir}/../../../nextflow/control_methods/perfect_denoising/main.nf" +include { alra } from "${meta.resources_dir}/../../../nextflow/methods/alra/main.nf" +include { dca } from "${meta.resources_dir}/../../../nextflow/methods/dca/main.nf" +include { knn_smoothing } from "${meta.resources_dir}/../../../nextflow/methods/knn_smoothing/main.nf" +include { magic } from "${meta.resources_dir}/../../../nextflow/methods/magic/main.nf" +include { scprint } from "${meta.resources_dir}/../../../nextflow/methods/scprint/main.nf" +include { mse } from "${meta.resources_dir}/../../../nextflow/metrics/mse/main.nf" +include { poisson } from "${meta.resources_dir}/../../../nextflow/metrics/poisson/main.nf" + +// inner workflow +// user-provided Nextflow code +include { checkItemAllowed } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + no_denoising, + perfect_denoising, + alra, + dca, + knn_smoothing, + magic, + scprint + ] + + // construct list of metrics + metrics = [ + mse, + poisson + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_uns_metadata.run( + fromState: [input: "input_test"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a defined method or all methods + filter: { id, state, comp -> + def method_check = checkItemAllowed( + comp.config.name, + state.methods_include, + state.methods_exclude, + "methods_include", + "methods_exclude" + ) + + method_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_train: "input_train", + input_test: "input_test" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." 
+ comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_test: "input_test", + input_prediction: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.name, + metric_output: output.output + ] + } + ) + + // extract the scores + | extract_uns_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + ["output", [output_scores: score_uns_file]] + } + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? + // NOTE: the 'denoising' task doesn't use normalized data, + // so code related to normalization_ids is commented out + + // extract the dataset metadata + meta_ch = dataset_ch + // // only keep one of the normalization methods + // | filter{ id, state -> + // state.dataset_uns.normalization_id == "log_cp10k" + // } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + // uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("_viash.yaml") + + // create output + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_dataset_info: dataset_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + output_ch = score_ch + | mix(meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. 
the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. 
+ // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/run_benchmark/nextflow.config b/target/nextflow/workflows/run_benchmark/nextflow.config new file mode 100644 index 0000000..5df17d7 --- /dev/null +++ b/target/nextflow/workflows/run_benchmark/nextflow.config @@ -0,0 +1,86 @@ +manifest { + name = 'workflows/run_benchmark' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = '1.0.0' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: lowmem { memory = 20.Gb } + withLabel: midmem { memory = 50.Gb } + withLabel: highmem { memory = 100.Gb } + withLabel: lowcpu { cpus = 5 } + withLabel: midcpu { cpus = 15 } + withLabel: highcpu { cpus = 30 } + withLabel: lowtime { time = 1.h } + withLabel: midtime { time = 4.h } + withLabel: hightime { time = 8.h } + withLabel: veryhightime { time = 24.h } +} + + diff --git a/thumbnail.svg b/thumbnail.svg new file 
mode 100644 index 0000000..65936f0 --- /dev/null +++ b/thumbnail.svg @@ -0,0 +1 @@ +[SVG markup lost in extraction; the thumbnail is a schematic whose only surviving text is the axis labels "dim-1" and "dim-2"] \ No newline at end of file
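
Editor's note: the `package_config` description embedded in `run_benchmark/main.nf` above explains molecular cross-validation (MCV): the observed molecules are partitioned into a training and a test split, a denoising method is applied to the training split, and accuracy is measured against the held-out test split. The sketch below is a minimal illustration of that idea only, not the repository's actual `process_dataset` component; the split fraction, seed, and toy matrix are assumptions made for the example.

```python
import numpy as np

rng = np.random.default_rng(0)

# toy UMI count matrix: 5 cells x 8 genes (illustrative only)
counts = rng.poisson(lam=2.0, size=(5, 8))

# MCV-style split: each observed molecule goes to the training split
# with probability `train_frac`; the remainder forms the test split
train_frac = 0.9  # assumed fraction, not taken from the repository
train = rng.binomial(counts, train_frac)
test = counts - train

# a denoising method would be run on `train`, and its output scored
# against `test` (e.g. with an MSE- or Poisson-based metric)
assert (train + test == counts).all()
```

In the benchmark itself this corresponds to the `--input_train` and `--input_test` h5ad files consumed by the workflow, with methods receiving only the training counts and metrics comparing their output to the test counts.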