From 5ac2ae87372d44c2a072846d1682779c200c01af Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 17 Sep 2024 08:30:25 +0200 Subject: [PATCH] Initial commit --- .github/ISSUE_TEMPLATE/bug_report.md | 24 +++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/ISSUE_TEMPLATE/feature_request.md | 20 ++ .github/PULL_REQUEST_TEMPLATE.md | 17 ++ .github/workflows/build.yaml | 21 ++ .github/workflows/test.yaml | 11 + .gitignore | 10 + .gitmodules | 3 + .vscode/settings.json | 10 + CHANGELOG.md | 26 +++ CONTRIBUTING.md | 148 +++++++++++++ LICENSE | 21 ++ README.md | 37 ++++ _viash.yaml | 88 ++++++++ common | 1 + main.nf | 3 + nextflow.config | 1 + scripts/create_component/.gitignore | 2 + .../create_component/create_python_method.sh | 8 + .../create_component/create_python_metric.sh | 8 + scripts/create_component/create_r_method.sh | 8 + scripts/create_component/create_r_metric.sh | 8 + scripts/create_readme.sh | 5 + scripts/create_resources/resources.sh | 34 +++ scripts/create_resources/test_resources.sh | 49 +++++ scripts/project/build_all_components.sh | 6 + .../project/build_all_docker_containers.sh | 7 + scripts/project/test_all_components.sh | 6 + scripts/run_benchmark/run_full_local.sh | 47 +++++ scripts/run_benchmark/run_full_seqeracloud.sh | 40 ++++ scripts/run_benchmark/run_test_local.sh | 40 ++++ scripts/run_benchmark/run_test_seqeracloud.sh | 36 ++++ scripts/sync_resources.sh | 5 + src/api/comp_control_method.yaml | 37 ++++ src/api/comp_data_processor.yaml | 31 +++ src/api/comp_method.yaml | 28 +++ src/api/comp_metric.yaml | 28 +++ src/api/file_common_dataset.yaml | 72 +++++++ src/api/file_prediction.yaml | 26 +++ src/api/file_score.yaml | 31 +++ src/api/file_solution.yaml | 73 +++++++ src/api/file_test_h5ad.yaml | 45 ++++ src/api/file_train_h5ad.yaml | 49 +++++ .../true_labels/config.vsh.yaml | 59 ++++++ src/control_methods/true_labels/script.py | 45 ++++ .../process_dataset/config.vsh.yaml | 34 +++ src/data_processors/process_dataset/script.py | 86 ++++++++ .../logistic_regression/config.vsh.yaml | 79 +++++++ src/methods/logistic_regression/script.py | 46 +++++ src/metrics/accuracy/config.vsh.yaml | 70 +++++++ src/metrics/accuracy/script.py | 47 +++++ .../process_datasets/config.vsh.yaml | 38 ++++ src/workflows/process_datasets/main.nf | 173 ++++++++++++++++ src/workflows/process_datasets/test.sh | 33 +++ src/workflows/run_benchmark/config.vsh.yaml | 72 +++++++ src/workflows/run_benchmark/main.nf | 194 ++++++++++++++++++ 56 files changed, 2147 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/test.yaml create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 .vscode/settings.json create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 _viash.yaml create mode 160000 common create mode 100644 main.nf create mode 100644 nextflow.config create mode 100644 scripts/create_component/.gitignore create mode 100755 scripts/create_component/create_python_method.sh create mode 100755 scripts/create_component/create_python_metric.sh create mode 100755 scripts/create_component/create_r_method.sh create mode 100755 scripts/create_component/create_r_metric.sh create mode 100755 scripts/create_readme.sh create mode 100755 
scripts/create_resources/resources.sh create mode 100755 scripts/create_resources/test_resources.sh create mode 100755 scripts/project/build_all_components.sh create mode 100755 scripts/project/build_all_docker_containers.sh create mode 100755 scripts/project/test_all_components.sh create mode 100755 scripts/run_benchmark/run_full_local.sh create mode 100755 scripts/run_benchmark/run_full_seqeracloud.sh create mode 100755 scripts/run_benchmark/run_test_local.sh create mode 100755 scripts/run_benchmark/run_test_seqeracloud.sh create mode 100755 scripts/sync_resources.sh create mode 100644 src/api/comp_control_method.yaml create mode 100644 src/api/comp_data_processor.yaml create mode 100644 src/api/comp_method.yaml create mode 100644 src/api/comp_metric.yaml create mode 100644 src/api/file_common_dataset.yaml create mode 100644 src/api/file_prediction.yaml create mode 100644 src/api/file_score.yaml create mode 100644 src/api/file_solution.yaml create mode 100644 src/api/file_test_h5ad.yaml create mode 100644 src/api/file_train_h5ad.yaml create mode 100644 src/control_methods/true_labels/config.vsh.yaml create mode 100644 src/control_methods/true_labels/script.py create mode 100644 src/data_processors/process_dataset/config.vsh.yaml create mode 100644 src/data_processors/process_dataset/script.py create mode 100644 src/methods/logistic_regression/config.vsh.yaml create mode 100644 src/methods/logistic_regression/script.py create mode 100644 src/metrics/accuracy/config.vsh.yaml create mode 100644 src/metrics/accuracy/script.py create mode 100644 src/workflows/process_datasets/config.vsh.yaml create mode 100644 src/workflows/process_datasets/main.nf create mode 100755 src/workflows/process_datasets/test.sh create mode 100644 src/workflows/run_benchmark/config.vsh.yaml create mode 100644 src/workflows/run_benchmark/main.nf diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..9a8a64b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: [bug] +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..a49eab2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..c17d3c0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: [enhancement] +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. 
+ +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..3717137 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,17 @@ +## Describe your changes + + + +## Checklist before requesting a review +- [ ] I have performed a self-review of my code + +- Check the correct box. Does this PR contain: + - [ ] Breaking changes + - [ ] New functionality + - [ ] Major changes + - [ ] Minor changes + - [ ] Bug fixes + +- [ ] Proposed changes are described in the CHANGELOG.md + +- [ ] CI Tests succeed and look good! \ No newline at end of file diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..804a4cb --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,21 @@ +name: Build + +on: + push: + branches: [ 'main' ] + workflow_dispatch: + inputs: + version: + description: | + The version of the project to build. Example: `1.0.3`. + + If not provided, a development build with a version name + based on the branch name will be built. Otherwise, a release + build with the provided version will be built. + required: false + +jobs: + build: + uses: viash-io/viash-actions/.github/workflows/build.yaml@v6 + with: + version: ${{ github.event.inputs.version }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..1b8d1db --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,11 @@ +name: Test + +on: + push: + branches: + - main + pull_request: + +jobs: + test: + uses: viash-io/viash-actions/.github/workflows/test.yaml@v6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..169fec0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +/resources +/resources_test +/work +/.nextflow* +/target + +.DS_Store +/output +trace-* +.ipynb_checkpoints diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c07c083 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "common"] + path = common + url = https://github.com/openproblems-bio/common_resources.git diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..7529210 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "yaml.schemas": { + "common/schemas/api_component_spec.yaml": "**/api/comp_*.yaml", + "common/schemas/api_file_format.yaml": "**/api/file_*.yaml", + "common/schemas/task_config.yaml": "_viash.yaml", + "common/schemas/task_method.yaml": "**/methods/**/config.vsh.yaml", + "common/schemas/task_control_method.yaml": "**/control_methods/**/config.vsh.yaml", + "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml" + } +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3839744 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,26 @@ +# task_template x.y.z + +## BREAKING CHANGES + + + +## NEW FUNCTIONALITY + +* Added `control_methods/true_labels` component (PR #5). + +* Added `methods/logistic_regression` component (PR #5). + +* Added `metrics/accuracy` component (PR #5). + +## MAJOR CHANGES + +* Updated `api` files (PR #5). + +* Updated configs, components and CI to the latest Viash version (PR #8). + +## MINOR CHANGES + +* Updated `README.md` (PR #5). 
+ +## BUGFIXES + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..abf8c37 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,148 @@ +# Contribution guidelines + +We encourage contributions from the community. To contribute: + +* Star this repository: Click the star button in the top-right corner of the repository to show your support. +* Fork the repository: Start by forking this repository to your account. +* Develop your component: Create your Viash component, ensuring it aligns with our best practices (detailed below). +* Submit a pull request: After testing your component, submit a pull request for review. + +## Installation + +You need to have Docker, Java, and Viash installed. Follow +[these instructions](https://openproblems.bio/documentation/fundamentals/requirements) +to install the required dependencies. + +## Getting started + +### Cloning the repository + +To get started, fork the repository and clone it to your local machine: + +```bash +git clone --recursive + +cd +``` + +NOTE: If you forgot to clone the repository with the `--recursive` flag, you can run the following command to update the submodules: + +```bash +git submodule update --init --recursive +``` + +### Downloading the test resources + +Next, you should download the test resources: + +```bash +scripts/sync_resources.sh +``` + +You may need to run this script again if the resources are updated. + +## Good first steps + +### Creating a new method + +To create a new method, you can use the following command: + +```bash +# in Python: +common/scripts/create_component \ + --name my_method \ + --language python \ + --type method + +# or in R: +common/scripts/create_component \ + --name my_method \ + --language r \ + --type method +``` + +This will create a new method in `src/methods/my_method`. Next, you should: + +* Fill in the component's metadata +* Specify dependencies +* Implement the method's code +* Run the unit test + +Please review our documentation on [creating a new method](https://openproblems.bio/documentation/create_component/add_a_method) for more information on how to do this. + + +### Creating a new metric + +Creating a new metric is similar to creating a new method. You can use the following command: + +```bash +# in Python: +common/scripts/create_component \ + --name my_metric \ + --language python \ + --type metric + +# or in R: +common/scripts/create_component \ + --name my_metric \ + --language r \ + --type metric +``` + +This will create a new metric in `src/metrics/my_metric`. Next, you should: + +* Fill in the component's metadata +* Specify dependencies +* Implement the metric's code +* Run the unit test + +Please review our documentation on [creating a new metric](https://openproblems.bio/documentation/create_component/add_a_metric) for more information. + + +## Frequently used commands + +To get started, you can run the following commands: + +### Sync the test data + +To sync the test data, run the following command: + +```bash +scripts/sync_resources.sh +``` + +### Building the components + +To run the benchmark, you first need to build the components. 
+ +```bash +viash ns build --parallel --setup cachedbuild +``` + +For each of the components, this will: + +* Build a Docker image +* Build an executable at `target/executable/` +* Build a Nextflow module at `target/nextflow/` + +### Running the unit tests + +To run the unit test of one component, you can use the following command: + +```bash +viash test src/path/to/config.vsh.yaml +``` + +To run the unit tests for all components, you can use the following command: + +```bash +viash ns test --parallel +``` + +### Running the benchmark + +To run the benchmark, you can use the following command: + +```bash +scripts/run_benchmark/run.sh +``` diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3a85904 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Open Problems in Single-Cell Analysis + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..da3ffe5 --- /dev/null +++ b/README.md @@ -0,0 +1,37 @@ +# Task Template + +This repo is a template to create a new task for the OpenProblems v2. This repo contains several example files and components that can be used when updated with the task info. + +> [!WARNING] +> This README will be overwritten when performing the `create_task_readme` script. + +## Create a repository from this template + +> [!IMPORTANT] +> Before creating a new repository, make sure you are part of the OpenProblems task team. This will be done when you create an issue for the task and you get the go ahead to create the task. +> For more information on how to create a new task, check out the [Create a new task](https://openproblems.bio/documentation/create_task/) documentation. + +The instructions below will guide you through creating a new repository from this template ([creating-a-repository-from-a-template](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template)). + + +* Click the "Use this template" button on the top right of the repository. +* Use the Owner dropdown menu to select the `openproblems-bio` account. +* Type a name for your repository (task_...), and a description. +* Set the repository visibility to public. +* Click "Create repository from template". 
+ +## Clone the repository + +To clone the repository with the submodule files, you can use the following command: + +```bash +git clone --recursive git@github.com:openproblems-bio/.git +``` +>[!NOTE] +> If somehow there are no files visible in the submodule after cloning using the above command. Check the instructions [here](common/README.md). + +## What to do next + +Check out the [instructions](https://github.com/openproblems-bio/common_resources/blob/main/INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands. + +For more information on the OpenProblems v2, check out the [documentation](https://openproblems.bio/documentation/). \ No newline at end of file diff --git a/_viash.yaml b/_viash.yaml new file mode 100644 index 0000000..fe4c625 --- /dev/null +++ b/_viash.yaml @@ -0,0 +1,88 @@ +viash_version: 0.9.0 + +# Step 1: Change the name of the task. +# example: task_name_of_this_task +name: task_template +organization: openproblems-bio +version: dev + +license: MIT +# Step 2: Add keywords to describe the task. +keywords: [single-cell, openproblems, benchmark] +# Step 3: Update the `task_template` to the name of the task from step 1. +links: + issue_tracker: https://github.com/openproblems-bio/task_template/issues + repository: https://github.com/openproblems-bio/task_template + docker_registry: ghcr.io + + +# Step 4: Update the label, summary and description. +# A unique, human-readable, short label. Used for creating summary tables and visualisations. +label: Template +summary: A one sentence summary of purpose and methodology. Used for creating an overview tables. +description: | + Provide a clear and concise description of your task, detailing the specific problem it aims + to solve. Outline the input data types, the expected output, and any assumptions or constraints. + Be sure to explain any terminology or concepts that are essential for understanding the task. + + Explain the motivation behind your proposed task. Describe the biological or computational + problem you aim to address and why it's important. Discuss the current state of research in + this area and any gaps or challenges that your task could help address. This section + should convince readers of the significance and relevance of your task. + +# A list of references to relevant literature. Each reference should be a DOI or a bibtex entry +references: + doi: + - 10.21203/rs.3.rs-4181617/v1 + # bibtex: + # - | + # @article{doe_2021_template, + # doi = {10.21203/rs.3.rs-4181617/v1}, + # url = {https://doi.org/10.21203/rs.3.rs-4181617/v1}, + # author = {Doe, John}, + # title = {A template for creating new tasks}, + # publisher = {Research Square}, + # year = {2021}, + # } + +info: + image: The name of the image file to use for the component on the website. + # Step 5: Replace the task_template to the name of the task. + test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/task_template/ + dest: resources_test/task_template + - type: s3 + path: s3://openproblems-data/resources_test/common/ + dest: resources_test/common + +# Step 6: Update the authors of the task. +authors: + # Full name of the author, usually in the name of FirstName MiddleName LastName. + - name: John Doe + # Role of the author. Possible values: + # + # * `"author"`: Authors who have made substantial contributions to the component. + # * `"maintainer"`: The maintainer of the component. 
+ # * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). + roles: [ "author", "maintainer" ] + # Additional information on the author + info: + github: johndoe + orcid: 0000-0000-0000-0000 + email: john@doe.me + twitter: johndoe + linkedin: johndoe + +# Step 7: Remove all of the comments of the steps you completed +# Step 8: High five yourself! + +config_mods: | + .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } + +repositories: + - name: core + type: github + repo: openproblems-bio/core + tag: build/main + path: viash/core diff --git a/common b/common new file mode 160000 index 0000000..bf64ebc --- /dev/null +++ b/common @@ -0,0 +1 @@ +Subproject commit bf64ebcaef096f37013733351a08671f7caca896 diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..62f0140 --- /dev/null +++ b/main.nf @@ -0,0 +1,3 @@ +workflow { + print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.") +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..8fc6c4e --- /dev/null +++ b/nextflow.config @@ -0,0 +1 @@ +process.container = 'nextflow/bash:latest' \ No newline at end of file diff --git a/scripts/create_component/.gitignore b/scripts/create_component/.gitignore new file mode 100644 index 0000000..09380f9 --- /dev/null +++ b/scripts/create_component/.gitignore @@ -0,0 +1,2 @@ +# if users change the scripts, the changes should not be committed. +/create_*_*.sh \ No newline at end of file diff --git a/scripts/create_component/create_python_method.sh b/scripts/create_component/create_python_method.sh new file mode 100755 index 0000000..b96c05d --- /dev/null +++ b/scripts/create_component/create_python_method.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_python_method \ + --language python \ + --type method diff --git a/scripts/create_component/create_python_metric.sh b/scripts/create_component/create_python_metric.sh new file mode 100755 index 0000000..d36bc7a --- /dev/null +++ b/scripts/create_component/create_python_metric.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_python_metric \ + --language python \ + --type metric diff --git a/scripts/create_component/create_r_method.sh b/scripts/create_component/create_r_method.sh new file mode 100755 index 0000000..0ab0394 --- /dev/null +++ b/scripts/create_component/create_r_method.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_r_method \ + --language r \ + --type method diff --git a/scripts/create_component/create_r_metric.sh b/scripts/create_component/create_r_metric.sh new file mode 100755 index 0000000..1a4794e --- /dev/null +++ b/scripts/create_component/create_r_metric.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_r_metric \ + --language r \ + --type metric diff --git a/scripts/create_readme.sh b/scripts/create_readme.sh new file mode 100755 index 0000000..0ed7aaf --- /dev/null +++ b/scripts/create_readme.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +common/scripts/create_task_readme --input src/api diff --git a/scripts/create_resources/resources.sh 
b/scripts/create_resources/resources.sh new file mode 100755 index 0000000..ccfd5fe --- /dev/null +++ b/scripts/create_resources/resources.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# remove this when you have implemented the script +echo "TODO: once the 'process_datasets' workflow is implemented, update this script to use it." +echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 2: replace the rename keys parameters to fit your process_dataset inputs" +echo " Step 3: replace the settings parameter to fit your process_dataset outputs" +echo " Step 4: remove this message" +exit 1 + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources/datasets/**/state.yaml +rename_keys: 'input:output_dataset' +output_state: '$id/state.yaml' +settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' +publish_dir: s3://openproblems-data/resources/task_template/datasets/ +HERE + +tw launch https://github.com/openproblems-bio/task_template.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_template,process_datasets diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh new file mode 100755 index 0000000..2b3378b --- /dev/null +++ b/scripts/create_resources/test_resources.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# remove this when you have implemented the script +echo "TODO: replace the commands in this script with the sequence of components that you need to run to generate test_resources." +echo " Inside this script, you will need to place commands to generate example files for each of the 'src/api/file_*.yaml' files." +exit 1 + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/task_template + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +nextflow run . 
\ + -main-script target/nextflow/workflows/process_datasets/main.nf \ + -profile docker \ + --publish_dir "$DATASET_DIR" \ + --id "pancreas" \ + --input "$RAW_DATA/pancreas/dataset.h5ad" \ + --output_train '$id/train.h5ad' \ + --output_test '$id/test.h5ad' \ + --output_solution '$id/solution.h5ad' \ + --output_state '$id/state.yaml' + +# run one method +viash run src/methods/knn/config.vsh.yaml -- \ + --input_train $DATASET_DIR/pancreas/train.h5ad \ + --input_test $DATASET_DIR/pancreas/test.h5ad \ + --output $DATASET_DIR/pancreas/prediction.h5ad + +# run one metric +viash run src/metrics/accuracy/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/pancreas/prediction.h5ad \ + --input_solution $DATASET_DIR/pancreas/solution.h5ad \ + --output $DATASET_DIR/pancreas/score.h5ad + +# only run this if you have access to the openproblems-data bucket +aws s3 sync --profile op \ + "$DATASET_DIR" s3://openproblems-data/resources_test/task_template \ + --delete --dryrun diff --git a/scripts/project/build_all_components.sh b/scripts/project/build_all_components.sh new file mode 100755 index 0000000..4e90d91 --- /dev/null +++ b/scripts/project/build_all_components.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -e + +# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html) +viash ns build --parallel diff --git a/scripts/project/build_all_docker_containers.sh b/scripts/project/build_all_docker_containers.sh new file mode 100755 index 0000000..5d43639 --- /dev/null +++ b/scripts/project/build_all_docker_containers.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -e + +# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html) +# and set up the container via a cached build +viash ns build --parallel --setup cachedbuild diff --git a/scripts/project/test_all_components.sh b/scripts/project/test_all_components.sh new file mode 100755 index 0000000..8a08afd --- /dev/null +++ b/scripts/project/test_all_components.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -e + +# Test all components in a namespace (refer https://viash.io/reference/cli/ns_test.html) +viash ns test --parallel diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh new file mode 100755 index 0000000..8c63393 --- /dev/null +++ b/scripts/run_benchmark/run_full_local.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# NOTE: depending on the the datasets and components, you may need to launch this workflow +# on a different compute platform (e.g. a HPC, AWS Cloud, Azure Cloud, Google Cloud). +# please refer to the nextflow information for more details: +# https://www.nextflow.io/docs/latest/ + +# remove this when you have implemented the script +echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." +echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" +echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" +echo " Step 4: remove this message" +exit 1 + +set -e + +echo "Running benchmark on test data" +echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!" 
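+# The parameters below drive the (to-be-implemented) run_benchmark workflow:
+# `input_states` is a glob matching the state.yaml files of the datasets to
+# benchmark, `rename_keys` maps the workflow's input arguments onto the
+# corresponding dataset outputs, and `publish_dir` is the directory the
+# results are published to.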
+ +# generate a unique id +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="resources/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: resources/datasets/**/state.yaml +rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +# run the benchmark +nextflow run openproblems-bio/task_template \ + --revision build/main \ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + -params-file /tmp/params.yaml diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh new file mode 100755 index 0000000..87d133c --- /dev/null +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# remove this when you have implemented the script +echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." +echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" +echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" +echo " Step 4: remove this message" +exit 1 + +set -e + +# generate a unique id +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml +rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task_template.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_template,full \ No newline at end of file diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh new file mode 100755 index 0000000..e549635 --- /dev/null +++ b/scripts/run_benchmark/run_test_local.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# remove this when you have implemented the script +echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." +echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" +echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" +echo " Step 4: remove this message" +exit 1 + +set -e + +echo "Running benchmark on test data" +echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!" 
+ +# generate a unique id +RUN_ID="testrun_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="resources/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources_test/task_template/**/state.yaml +rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +nextflow run . \ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + -params-file /tmp/params.yaml diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh new file mode 100755 index 0000000..ac91020 --- /dev/null +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# remove this when you have implemented the script +echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." +echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" +echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" +echo " Step 4: remove this message" +exit 1 + +set -e + +# write the parameters to file +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources_test/task_template/**/state.yaml +rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +output_state: "state.yaml" +publish_dir: s3://openproblems-nextflow/temp/task_template/ +HERE + +tw launch https://github.com/openproblems-bio/task_template.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_template,test diff --git a/scripts/sync_resources.sh b/scripts/sync_resources.sh new file mode 100755 index 0000000..20b87e7 --- /dev/null +++ b/scripts/sync_resources.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +common/scripts/sync_resources diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml new file mode 100644 index 0000000..4d767d8 --- /dev/null +++ b/src/api/comp_control_method.yaml @@ -0,0 +1,37 @@ +namespace: control_methods +info: + type: control_method + type_info: + label: Control Method + summary: Quality control methods for verifying the pipeline. + description: | + This folder contains control components for the task. + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. 
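+# Control methods run through the same benchmark workflow as regular methods,
+# but additionally receive the ground-truth labels via `--input_solution`.
+# A positive control (such as the `true_labels` component in this commit,
+# which copies the solution labels into the prediction) should reach the best
+# attainable metric values; a negative control (e.g. randomly shuffled labels,
+# not included in this commit) would mark the baseline below which methods
+# should not fall.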
+arguments: + - name: --input_train + __merge__: file_train_h5ad.yaml + required: true + direction: input + - name: --input_test + __merge__: file_test_h5ad.yaml + required: true + direction: input + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml new file mode 100644 index 0000000..184bc54 --- /dev/null +++ b/src/api/comp_data_processor.yaml @@ -0,0 +1,31 @@ +namespace: "data_processors" +info: + type: data_processor + type_info: + label: Data processor + summary: A data processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. +arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_train" + __merge__: file_train_h5ad.yaml + direction: output + required: true + - name: "--output_test" + __merge__: file_test_h5ad.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py + diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml new file mode 100644 index 0000000..d7be957 --- /dev/null +++ b/src/api/comp_method.yaml @@ -0,0 +1,28 @@ +namespace: "methods" +info: + type: method + type_info: + label: Method + summary: A method. + description: | + A method to predict the task effects. +arguments: + - name: --input_train + __merge__: file_train_h5ad.yaml + required: true + direction: input + - name: "--input_test" + __merge__: file_test_h5ad.yaml + direction: input + required: true + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml new file mode 100644 index 0000000..e3295da --- /dev/null +++ b/src/api/comp_metric.yaml @@ -0,0 +1,28 @@ +namespace: "metrics" +info: + type: metric + type_info: + label: Metric + summary: A task template metric. + description: | + A metric for evaluating method predictions. 
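+# For a worked example, see the `accuracy` component in this commit: it reads
+# the prediction and solution h5ad files, compares `.obs["label_pred"]`
+# against `.obs["label"]`, and writes an output AnnData whose `.uns` carries
+# equal-length `metric_ids` and `metric_values` arrays alongside the copied
+# `dataset_id`, `normalization_id` and `method_id` fields (see
+# `file_score.yaml`).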
+arguments: + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--input_prediction" + __merge__: file_prediction.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml new file mode 100644 index 0000000..0927ea0 --- /dev/null +++ b/src/api/file_common_dataset.yaml @@ -0,0 +1,72 @@ +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +label: "Common Dataset" +summary: A subset of the common dataset. +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Cell type information + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml new file mode 100644 index 0000000..4a6dc32 --- /dev/null +++ b/src/api/file_prediction.yaml @@ -0,0 +1,26 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/prediction.h5ad" +label: "Predicted data" +summary: A predicted dataset as output by a method. +info: + format: + type: h5ad + obs: + - type: string + name: label_pred + description: Predicted labels for the test cells. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true \ No newline at end of file diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml new file mode 100644 index 0000000..f6022a8 --- /dev/null +++ b/src/api/file_score.yaml @@ -0,0 +1,31 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: resources/score.h5ad +label: Score +summary: "File indicating the score of a metric." +info: + format: + type: h5ad + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true \ No newline at end of file diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml new file mode 100644 index 0000000..81e168e --- /dev/null +++ b/src/api/file_solution.yaml @@ -0,0 +1,73 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/solution.h5ad" +label: "Solution" +summary: "The solution for the test data" +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/api/file_test_h5ad.yaml b/src/api/file_test_h5ad.yaml new file mode 100644 index 0000000..6ee21ac --- /dev/null +++ b/src/api/file_test_h5ad.yaml @@ -0,0 +1,45 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/test.h5ad" +label: "Test data" +summary: The subset of molecules used for the test dataset +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file diff --git a/src/api/file_train_h5ad.yaml b/src/api/file_train_h5ad.yaml new file mode 100644 index 0000000..7d2b51d --- /dev/null +++ b/src/api/file_train_h5ad.yaml @@ -0,0 +1,49 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/train.h5ad" +label: "Training data" +summary: "The training data in h5ad format" +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml new file mode 100644 index 0000000..741e3f2 --- /dev/null +++ b/src/control_methods/true_labels/config.vsh.yaml @@ -0,0 +1,59 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_control_method.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: true_labels + +# A relatively short label, used when rendering visualisations (required) +label: True Labels +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. 
+summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + A positive control, where the solution labels are copied 1 to 1 to the predicted data. + +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: counts + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + # setup: + # - type: python + # packages: scib==1.1.5 + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py new file mode 100644 index 0000000..0a04aaf --- /dev/null +++ b/src/control_methods/true_labels/script.py @@ -0,0 +1,45 @@ +import anndata as ad + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', + 'input_solution': 'resources_test/task_template/pancreas/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'true_labels' +} +## VIASH END + +print('Reading input files', flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) +input_solution = ad.read_h5ad(par['input_solution']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... + +print('Generate predictions', flush=True) +# ... generate predictions ... +obs_label_pred = input_solution.obs["label"] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], + 'method_id': meta['name'] + }, + obs={ + 'label_pred': obs_label_pred + } +) +output.obs_names = input_test.obs_names + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml new file mode 100644 index 0000000..a997720 --- /dev/null +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_data_processor.yaml +name: process_dataset +arguments: + - name: "--method" + type: "string" + description: "The process method to assign train/test." + choices: ["batch", "random"] + default: "batch" + - name: "--obs_label" + type: "string" + description: "Which .obs slot to use as label." 
+ default: "cell_type" + - name: "--obs_batch" + type: "string" + description: "Which .obs slot to use as batch covariate." + default: "batch" + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 +resources: + - type: python_script + path: script.py + - path: /common/helper_functions/subset_h5ad_by_format.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py new file mode 100644 index 0000000..350d556 --- /dev/null +++ b/src/data_processors/process_dataset/script.py @@ -0,0 +1,86 @@ +import sys +import random +import numpy as np +import anndata as ad +import openproblems as op + +## VIASH START +par = { + 'input': 'resources_test/common/pancreas/dataset.h5ad', + 'method': 'batch', + 'seed': None, + 'obs_batch': 'batch', + 'obs_label': 'cell_type', + 'output_train': 'train.h5ad', + 'output_test': 'test.h5ad', + 'output_solution': 'solution.h5ad' +} +meta = { + 'resources_dir': 'target/executable/data_processors/process_dataset', + 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' +} +## VIASH END + +# import helper functions +sys.path.append(meta['resources_dir']) +from subset_h5ad_by_format import subset_h5ad_by_format + +config = op.project.read_viash_config(meta["config"]) + +# set seed if need be +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print(">> Load data", flush=True) +adata = ad.read_h5ad(par["input"]) +print("input:", adata) + +print(f">> Process data using {par['method']} method") +if par["method"] == "batch": + batch_info = adata.obs[par["obs_batch"]] + batch_categories = batch_info.dtype.categories + test_batches = random.sample(list(batch_categories), 1) + is_test = [ x in test_batches for x in batch_info ] +elif par["method"] == "random": + train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) + is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] + +# subset the different adatas +print(">> Figuring which data needs to be copied to which output file", flush=True) +# use par arguments to look for label and batch value in different slots +slot_mapping = { + "obs": { + "label": par["obs_label"], + "batch": par["obs_batch"], + } +} + +print(">> Creating train data", flush=True) +output_train = subset_h5ad_by_format( + adata[[not x for x in is_test]], + config, + "output_train", + slot_mapping +) + +print(">> Creating test data", flush=True) +output_test = subset_h5ad_by_format( + adata[is_test], + config, + "output_test", + slot_mapping +) + +print(">> Creating solution data", flush=True) +output_solution = subset_h5ad_by_format( + adata[is_test], + config, + "output_solution", + slot_mapping +) + +print(">> Writing data", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) +output_solution.write_h5ad(par["output_solution"]) diff --git a/src/methods/logistic_regression/config.vsh.yaml b/src/methods/logistic_regression/config.vsh.yaml new file mode 100644 index 0000000..479aa3a --- /dev/null +++ b/src/methods/logistic_regression/config.vsh.yaml @@ -0,0 +1,79 @@ +# The API specifies which type of component this is. 
+# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_method.yaml + + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: logistic_regression +# A relatively short label, used when rendering visualisations (required) +label: Logistic Regression +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. +summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + Logistic Regression estimates parameters of a logistic function for + multivariate classification tasks. Here, we use 100-dimensional whitened PCA + coordinates as independent variables, and the model minimises the cross + entropy loss over all cell type classes. +# Metadata for your component +# A reference key from the bibtex library at src/common/library.bib (required). +references: + bibtex: + - | + @book{hosmer2013applied, + title = {Applied logistic regression}, + author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, + year = {2013}, + publisher = {John Wiley \& Sons}, + volume = {398} + } + +links: + # URL to the code repository for this method (required). + repository: https://github.com/scikit-learn/scikit-learn + # URL to the documentation for this method (required). + documentation: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: log_cp10k + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: scikit-learn + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py new file mode 100644 index 0000000..cc851f8 --- /dev/null +++ b/src/methods/logistic_regression/script.py @@ -0,0 +1,46 @@ +import anndata as ad +import sklearn.linear_model + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
+par = { + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'logistic_regression' +} +## VIASH END + +print('Reading input files', flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... +classifier = sklearn.linear_model.LogisticRegression() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) + +print('Generate predictions', flush=True) +# ... generate predictions ... +obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], + 'method_id': meta['name'] + }, + obs={ + 'label_pred': obs_label_pred + } +) +output.obs_names = input_test.obs_names + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/accuracy/config.vsh.yaml new file mode 100644 index 0000000..66fa835 --- /dev/null +++ b/src/metrics/accuracy/config.vsh.yaml @@ -0,0 +1,70 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: accuracy + +# Metadata for your component +info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: accuracy + # A relatively short label, used when rendering visualisations (required) + label: Accuracy + # A one sentence summary of how this metric works (required). Used when + # rendering summary tables. + summary: "The percentage of correctly predicted labels." + # A multi-line description of how this component works (required). Used + # when rendering reference documentation. + description: | + The percentage of correctly predicted labels. + # A reference key from the bibtex library at src/common/library.bib (required). + references: + doi: 10.48550/arXiv.2008.05756 + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: 1 + # Whether a higher value represents a 'better' solution (required) + maximize: true + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: scikit-learn + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline.
+ - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py new file mode 100644 index 0000000..72dcb1e --- /dev/null +++ b/src/metrics/accuracy/script.py @@ -0,0 +1,47 @@ +import anndata as ad +import numpy as np +import sklearn.preprocessing + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_solution': 'resources_test/task_template/pancreas/solution.h5ad', + 'input_prediction': 'resources_test/task_template/pancreas/prediction.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'accuracy' +} +## VIASH END + +print('Reading input files', flush=True) +input_solution = ad.read_h5ad(par['input_solution']) +input_prediction = ad.read_h5ad(par['input_prediction']) + +assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +print("Encode labels", flush=True) +cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) +encoder = sklearn.preprocessing.LabelEncoder().fit(cats) +input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) +input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) + + +print('Compute metrics', flush=True) +# metric_ids and metric_values can have length > 1 +# but should be of equal length +uns_metric_ids = [ 'accuracy' ] +uns_metric_values = [ np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) ] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_prediction.uns['dataset_id'], + 'normalization_id': input_prediction.uns['normalization_id'], + 'method_id': input_prediction.uns['method_id'], + 'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000..24d03a7 --- /dev/null +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,38 @@ +name: process_datasets +namespace: workflows + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/api/file_common_dataset.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_train" + __merge__: /src/api/file_train_h5ad.yaml + required: true + direction: output + - name: "--output_test" + __merge__: /src/api/file_test_h5ad.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/api/file_solution.yaml + required: true + direction: output + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf + +dependencies: + - name: schema/verify_data_structure + repository: core + - name: data_processors/process_dataset + +runners: + - type: nextflow diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf new file mode 100644 index 0000000..8ffa666 --- /dev/null +++ b/src/workflows/process_datasets/main.nf @@ -0,0 +1,173 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStatesTemp(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + 
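+ // The main block below: verify each input dataset against the schema of the '--input' argument, + // drop datasets that fail the check, split each remaining dataset into train/test/solution + // files with process_dataset, and keep only those output files in the final state.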
+ main: + output_ch = input_ch + + | verify_data_structure.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if the dataset passed the QC + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test", "output_solution"]) + + emit: + output_ch +} + + +// temp fix for rename_keys typo + +def findStatesTemp(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to the input directory containing the datasets to be processed.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON blob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesTempWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ?
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesTempWf +} \ No newline at end of file diff --git a/src/workflows/process_datasets/test.sh b/src/workflows/process_datasets/test.sh new file mode 100755 index 0000000..d918102 --- /dev/null +++ b/src/workflows/process_datasets/test.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Build the components prior to executing this script, e.g. with: +# viash ns build --parallel + +# get the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +DATASETS_DIR="resources_test/common" +OUTPUT_DIR="output/process_datasets_test" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=24.04.3 + +nextflow run .
\ + -main-script target/nextflow/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --id run_test \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000..6b3a3d5 --- /dev/null +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,72 @@ +name: run_benchmark +namespace: workflows + +argument_groups: + - name: Inputs + arguments: + - name: "--input_train" + __merge__: /src/api/file_train_h5ad.yaml + type: file + direction: input + required: true + - name: "--input_test" + __merge__: /src/api/file_test_h5ad.yaml + type: file + direction: input + required: true + - name: "--input_solution" + __merge__: /src/api/file_solution.yaml + type: file + direction: input + required: true + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. 
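+ # e.g. `--method_ids logistic_regression` runs only the logistic_regression method; + # ids are matched against the component names in the `methods` list in main.nf.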
+ +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /_viash.yaml + +dependencies: + - name: h5ad/extract_uns_metadata + repository: core + - name: control_methods/true_labels + - name: methods/logistic_regression + - name: metrics/accuracy + +runners: + - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf new file mode 100644 index 0000000..2ea9016 --- /dev/null +++ b/src/workflows/run_benchmark/main.nf @@ -0,0 +1,194 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +// construct list of methods and control methods +methods = [ + true_labels, + logistic_regression +] + +// construct list of metrics +metrics = [ + accuracy +] + +workflow run_wf { + take: + input_ch + + main: + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_uns_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input_train: state.input_train, + input_test: state.input_test + ] + if (comp.config.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_solution: "input_solution", + input_prediction: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.name, + metric_output: output.output + ] + } + ) + + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? 
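+ // The remainder of the workflow gathers the dataset metadata, the method and metric configs, + // the task info from _viash.yaml and the per-run scores, writes each of them to a YAML file, + // and merges everything into a single output state.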
+ + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_uns_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def viash_file = meta.resources_dir.resolve("_viash.yaml") + def viash_file_content = toYamlBlob(readYaml(viash_file).info) + def task_info_file = tempFile("task_info.yaml") + task_info_file.write(viash_file_content) + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +}