From d11e85087744738f34465c3bcf91910e521f5539 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 26 Aug 2024 15:50:39 +0200 Subject: [PATCH] add viash template --- .github/ISSUE_TEMPLATE/bug_report.md | 24 ++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/ISSUE_TEMPLATE/feature_request.md | 20 ++ .github/PULL_REQUEST_TEMPLATE.md | 17 + .github/workflows/test.yaml | 11 + .gitignore | 4 + .gitmodules | 3 + CHANGELOG.md | 26 ++ LICENSE | 21 ++ _viash.yaml | 87 +++++ common | 1 + main.nf | 3 + nextflow.config | 1 + scripts/.gitignore | 3 + {scripts => scripts_old}/aggregate_counts.py | 0 .../aggregate_group_metrics.py | 0 {scripts => scripts_old}/aggregate_metrics.py | 0 .../aggregate_tile_assignments.py | 0 .../annotate_celltypes.py | 0 .../annotate_celltypes_FRmatch.r | 0 .../annotate_celltypes_consensus_GMCS.py | 0 .../annotate_celltypes_consensus_NWCS.py | 0 .../annotate_celltypes_mapmycells.py | 0 .../annotate_celltypes_mfishtools.r | 0 .../annotate_celltypes_scrattchmapping.r | 0 .../annotate_celltypes_tangram.py | 0 {scripts => scripts_old}/annotate_counts.py | 0 {scripts => scripts_old}/basic_assignment.py | 0 .../calc_group_metrics.py | 0 {scripts => scripts_old}/calc_metrics.py | 0 .../calc_quality_metrics.py | 0 .../copy_celltypes_pciseqct.py | 0 {scripts => scripts_old}/gen_counts.py | 0 {scripts => scripts_old}/generate_tile.py | 0 {scripts => scripts_old}/normalize_sc.py | 0 .../retrieve_tiles_info.py | 0 {scripts => scripts_old}/run_baysor.py | 0 {scripts => scripts_old}/run_clustermap.py | 0 {scripts => scripts_old}/run_mesmer.py | 0 {scripts => scripts_old}/run_pciseq.py | 0 {scripts => scripts_old}/segment_image.py | 0 src/api/comp_control_method.yaml | 37 +++ src/api/comp_data_processor.yaml | 31 ++ src/api/comp_method.yaml | 28 ++ src/api/comp_metric.yaml | 28 ++ src/api/file_common_dataset.yaml | 72 ++++ src/api/file_prediction.yaml | 26 ++ src/api/file_score.yaml | 31 ++ src/api/file_solution.yaml | 73 ++++ src/api/file_test_h5ad.yaml | 45 +++ src/api/file_train_h5ad.yaml | 49 +++ .../true_labels/config.vsh.yaml | 59 ++++ src/control_methods/true_labels/script.py | 45 +++ .../process_dataset/config.vsh.yaml | 34 ++ src/data_processors/process_dataset/script.py | 86 +++++ .../logistic_regression/config.vsh.yaml | 77 +++++ src/methods/logistic_regression/script.py | 46 +++ src/metrics/accuracy/config.vsh.yaml | 70 ++++ src/metrics/accuracy/script.py | 47 +++ .../process_datasets/config.vsh.yaml | 40 +++ src/workflows/process_datasets/main.nf | 173 ++++++++++ src/workflows/run_benchmark/config.vsh.yaml | 74 +++++ src/workflows/run_benchmark/main.nf | 311 ++++++++++++++++++ 63 files changed, 1704 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/test.yaml create mode 100644 .gitmodules create mode 100644 CHANGELOG.md create mode 100644 LICENSE create mode 100644 _viash.yaml create mode 160000 common create mode 100644 main.nf create mode 100644 nextflow.config create mode 100644 scripts/.gitignore rename {scripts => scripts_old}/aggregate_counts.py (100%) rename {scripts => scripts_old}/aggregate_group_metrics.py (100%) rename {scripts => scripts_old}/aggregate_metrics.py (100%) rename {scripts => scripts_old}/aggregate_tile_assignments.py (100%) rename {scripts => scripts_old}/annotate_celltypes.py (100%) rename {scripts => 
scripts_old}/annotate_celltypes_FRmatch.r (100%) rename {scripts => scripts_old}/annotate_celltypes_consensus_GMCS.py (100%) rename {scripts => scripts_old}/annotate_celltypes_consensus_NWCS.py (100%) rename {scripts => scripts_old}/annotate_celltypes_mapmycells.py (100%) rename {scripts => scripts_old}/annotate_celltypes_mfishtools.r (100%) rename {scripts => scripts_old}/annotate_celltypes_scrattchmapping.r (100%) rename {scripts => scripts_old}/annotate_celltypes_tangram.py (100%) rename {scripts => scripts_old}/annotate_counts.py (100%) rename {scripts => scripts_old}/basic_assignment.py (100%) rename {scripts => scripts_old}/calc_group_metrics.py (100%) rename {scripts => scripts_old}/calc_metrics.py (100%) rename {scripts => scripts_old}/calc_quality_metrics.py (100%) rename {scripts => scripts_old}/copy_celltypes_pciseqct.py (100%) rename {scripts => scripts_old}/gen_counts.py (100%) rename {scripts => scripts_old}/generate_tile.py (100%) rename {scripts => scripts_old}/normalize_sc.py (100%) rename {scripts => scripts_old}/retrieve_tiles_info.py (100%) rename {scripts => scripts_old}/run_baysor.py (100%) rename {scripts => scripts_old}/run_clustermap.py (100%) rename {scripts => scripts_old}/run_mesmer.py (100%) rename {scripts => scripts_old}/run_pciseq.py (100%) rename {scripts => scripts_old}/segment_image.py (100%) create mode 100644 src/api/comp_control_method.yaml create mode 100644 src/api/comp_data_processor.yaml create mode 100644 src/api/comp_method.yaml create mode 100644 src/api/comp_metric.yaml create mode 100644 src/api/file_common_dataset.yaml create mode 100644 src/api/file_prediction.yaml create mode 100644 src/api/file_score.yaml create mode 100644 src/api/file_solution.yaml create mode 100644 src/api/file_test_h5ad.yaml create mode 100644 src/api/file_train_h5ad.yaml create mode 100644 src/control_methods/true_labels/config.vsh.yaml create mode 100644 src/control_methods/true_labels/script.py create mode 100644 src/data_processors/process_dataset/config.vsh.yaml create mode 100644 src/data_processors/process_dataset/script.py create mode 100644 src/methods/logistic_regression/config.vsh.yaml create mode 100644 src/methods/logistic_regression/script.py create mode 100644 src/metrics/accuracy/config.vsh.yaml create mode 100644 src/metrics/accuracy/script.py create mode 100644 src/workflows/process_datasets/config.vsh.yaml create mode 100644 src/workflows/process_datasets/main.nf create mode 100644 src/workflows/run_benchmark/config.vsh.yaml create mode 100644 src/workflows/run_benchmark/main.nf diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..9a8a64b4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: [bug] +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..a49eab2f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..c17d3c0d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: [enhancement] +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..37171375 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,17 @@ +## Describe your changes + + + +## Checklist before requesting a review +- [ ] I have performed a self-review of my code + +- Check the correct box. Does this PR contain: + - [ ] Breaking changes + - [ ] New functionality + - [ ] Major changes + - [ ] Minor changes + - [ ] Bug fixes + +- [ ] Proposed changes are described in the CHANGELOG.md + +- [ ] CI Tests succeed and look good! \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 00000000..7194c6ea --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,11 @@ +name: Test + +on: + push: + branches: + - main + pull_request: + +jobs: + build: + uses: viash-io/viash-actions/.github/workflows/test.yaml@v6 diff --git a/.gitignore b/.gitignore index dac9fd8a..d86eb196 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,7 @@ temp/ dependency.png envs/src/ singularity_container/ + +target +.nextflow +.resources_test \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..d35f6d23 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "common"] + path = common + url = git@github.com:openproblems-bio/common_resources.git diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..38397448 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,26 @@ +# task_template x.y.z + +## BREAKING CHANGES + + + +## NEW FUNCTIONALITY + +* Added `control_methods/true_labels` component (PR #5). + +* Added `methods/logistic_regression` component (PR #5). + +* Added `metrics/accuracy` component (PR #5). + +## MAJOR CHANGES + +* Updated `api` files (PR #5). + +* Updated configs, components and CI to the latest Viash version (PR #8). + +## MINOR CHANGES + +* Updated `README.md` (PR #5). 
+ +## BUGFIXES + diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..3a85904e --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Open Problems in Single-Cell Analysis + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/_viash.yaml b/_viash.yaml new file mode 100644 index 00000000..c4d0ca05 --- /dev/null +++ b/_viash.yaml @@ -0,0 +1,87 @@ +viash_version: 0.9.0-RC7 + +# Step 1: Change the name of the task. +# example: task_name_of_this_task +name: task_preprocessing_imagingbased_st +organization: openproblems-bio +version: dev + +license: MIT +# Step 2: Add keywords to describe the task. +keywords: [single-cell, openproblems, benchmark] +# Step 3: Update the `task_template` to the name of the task from step 1. +links: + issue_tracker: https://github.com/openproblems-bio/task_preprocessing_imagingbased_st/issues + repository: https://github.com/openproblems-bio/task_preprocessing_imagingbased_st + docker_registry: ghcr.io + + +# Step 4: Update the label, summary and description. +# A unique, human-readable, short label. Used for creating summary tables and visualisations. +label: Template +summary: A one sentence summary of purpose and methodology. Used for creating overview tables. +description: | + Provide a clear and concise description of your task, detailing the specific problem it aims + to solve. Outline the input data types, the expected output, and any assumptions or constraints. + Be sure to explain any terminology or concepts that are essential for understanding the task. + + Explain the motivation behind your proposed task. Describe the biological or computational + problem you aim to address and why it's important. Discuss the current state of research in + this area and any gaps or challenges that your task could help address. This section + should convince readers of the significance and relevance of your task. + +# A list of references to relevant literature. Each reference should be a DOI or a bibtex entry +references: + doi: + - 10.21203/rs.3.rs-4181617/v1 + # bibtex: + # - | + # @article{doe_2021_template, + # doi = {10.21203/rs.3.rs-4181617/v1}, + # url = {https://doi.org/10.21203/rs.3.rs-4181617/v1}, + # author = {Doe, John}, + # title = {A template for creating new tasks}, + # publisher = {Research Square}, + # year = {2021}, + # } + +info: + image: The name of the image file to use for the component on the website. + # Step 5: Replace `task_template` with the name of the task.
+ test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/task_template/ + dest: resources_test/task_template + - type: s3 + path: s3://openproblems-data/resources_test/common/ + dest: resources_test/common + +# Step 6: Update the authors of the task. +authors: + # Full name of the author, usually in the name of FirstName MiddleName LastName. + - name: John Doe + # Role of the author. Possible values: + # + # * `"author"`: Authors who have made substantial contributions to the component. + # * `"maintainer"`: The maintainer of the component. + # * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). + roles: [ "author", "maintainer" ] + # Additional information on the author + info: + github: johndoe + orcid: 0000-0000-0000-0000 + email: john@doe.me + twitter: johndoe + linkedin: johndoe + +# Step 7: Remove all of the comments of the steps you completed +# Step 8: High five yourself! + +config_mods: | + .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } + +repositories: + - name: openproblems-v2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build \ No newline at end of file diff --git a/common b/common new file mode 160000 index 00000000..67ef9612 --- /dev/null +++ b/common @@ -0,0 +1 @@ +Subproject commit 67ef9612fce1bbf22e07971e55a9858e8dd2dfa5 diff --git a/main.nf b/main.nf new file mode 100644 index 00000000..62f01409 --- /dev/null +++ b/main.nf @@ -0,0 +1,3 @@ +workflow { + print("This is a dummy placeholder for pipeline execution. 
Please use the corresponding nf files for running pipelines.") +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 00000000..8fc6c4e3 --- /dev/null +++ b/nextflow.config @@ -0,0 +1 @@ +process.container = 'nextflow/bash:latest' \ No newline at end of file diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 00000000..2f7ffd34 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1,3 @@ +add_a_method.sh +add_a_control_method.sh +add_a_metric.sh \ No newline at end of file diff --git a/scripts/aggregate_counts.py b/scripts_old/aggregate_counts.py similarity index 100% rename from scripts/aggregate_counts.py rename to scripts_old/aggregate_counts.py diff --git a/scripts/aggregate_group_metrics.py b/scripts_old/aggregate_group_metrics.py similarity index 100% rename from scripts/aggregate_group_metrics.py rename to scripts_old/aggregate_group_metrics.py diff --git a/scripts/aggregate_metrics.py b/scripts_old/aggregate_metrics.py similarity index 100% rename from scripts/aggregate_metrics.py rename to scripts_old/aggregate_metrics.py diff --git a/scripts/aggregate_tile_assignments.py b/scripts_old/aggregate_tile_assignments.py similarity index 100% rename from scripts/aggregate_tile_assignments.py rename to scripts_old/aggregate_tile_assignments.py diff --git a/scripts/annotate_celltypes.py b/scripts_old/annotate_celltypes.py similarity index 100% rename from scripts/annotate_celltypes.py rename to scripts_old/annotate_celltypes.py diff --git a/scripts/annotate_celltypes_FRmatch.r b/scripts_old/annotate_celltypes_FRmatch.r similarity index 100% rename from scripts/annotate_celltypes_FRmatch.r rename to scripts_old/annotate_celltypes_FRmatch.r diff --git a/scripts/annotate_celltypes_consensus_GMCS.py b/scripts_old/annotate_celltypes_consensus_GMCS.py similarity index 100% rename from scripts/annotate_celltypes_consensus_GMCS.py rename to scripts_old/annotate_celltypes_consensus_GMCS.py diff --git a/scripts/annotate_celltypes_consensus_NWCS.py b/scripts_old/annotate_celltypes_consensus_NWCS.py similarity index 100% rename from scripts/annotate_celltypes_consensus_NWCS.py rename to scripts_old/annotate_celltypes_consensus_NWCS.py diff --git a/scripts/annotate_celltypes_mapmycells.py b/scripts_old/annotate_celltypes_mapmycells.py similarity index 100% rename from scripts/annotate_celltypes_mapmycells.py rename to scripts_old/annotate_celltypes_mapmycells.py diff --git a/scripts/annotate_celltypes_mfishtools.r b/scripts_old/annotate_celltypes_mfishtools.r similarity index 100% rename from scripts/annotate_celltypes_mfishtools.r rename to scripts_old/annotate_celltypes_mfishtools.r diff --git a/scripts/annotate_celltypes_scrattchmapping.r b/scripts_old/annotate_celltypes_scrattchmapping.r similarity index 100% rename from scripts/annotate_celltypes_scrattchmapping.r rename to scripts_old/annotate_celltypes_scrattchmapping.r diff --git a/scripts/annotate_celltypes_tangram.py b/scripts_old/annotate_celltypes_tangram.py similarity index 100% rename from scripts/annotate_celltypes_tangram.py rename to scripts_old/annotate_celltypes_tangram.py diff --git a/scripts/annotate_counts.py b/scripts_old/annotate_counts.py similarity index 100% rename from scripts/annotate_counts.py rename to scripts_old/annotate_counts.py diff --git a/scripts/basic_assignment.py b/scripts_old/basic_assignment.py similarity index 100% rename from scripts/basic_assignment.py rename to scripts_old/basic_assignment.py diff --git a/scripts/calc_group_metrics.py 
b/scripts_old/calc_group_metrics.py similarity index 100% rename from scripts/calc_group_metrics.py rename to scripts_old/calc_group_metrics.py diff --git a/scripts/calc_metrics.py b/scripts_old/calc_metrics.py similarity index 100% rename from scripts/calc_metrics.py rename to scripts_old/calc_metrics.py diff --git a/scripts/calc_quality_metrics.py b/scripts_old/calc_quality_metrics.py similarity index 100% rename from scripts/calc_quality_metrics.py rename to scripts_old/calc_quality_metrics.py diff --git a/scripts/copy_celltypes_pciseqct.py b/scripts_old/copy_celltypes_pciseqct.py similarity index 100% rename from scripts/copy_celltypes_pciseqct.py rename to scripts_old/copy_celltypes_pciseqct.py diff --git a/scripts/gen_counts.py b/scripts_old/gen_counts.py similarity index 100% rename from scripts/gen_counts.py rename to scripts_old/gen_counts.py diff --git a/scripts/generate_tile.py b/scripts_old/generate_tile.py similarity index 100% rename from scripts/generate_tile.py rename to scripts_old/generate_tile.py diff --git a/scripts/normalize_sc.py b/scripts_old/normalize_sc.py similarity index 100% rename from scripts/normalize_sc.py rename to scripts_old/normalize_sc.py diff --git a/scripts/retrieve_tiles_info.py b/scripts_old/retrieve_tiles_info.py similarity index 100% rename from scripts/retrieve_tiles_info.py rename to scripts_old/retrieve_tiles_info.py diff --git a/scripts/run_baysor.py b/scripts_old/run_baysor.py similarity index 100% rename from scripts/run_baysor.py rename to scripts_old/run_baysor.py diff --git a/scripts/run_clustermap.py b/scripts_old/run_clustermap.py similarity index 100% rename from scripts/run_clustermap.py rename to scripts_old/run_clustermap.py diff --git a/scripts/run_mesmer.py b/scripts_old/run_mesmer.py similarity index 100% rename from scripts/run_mesmer.py rename to scripts_old/run_mesmer.py diff --git a/scripts/run_pciseq.py b/scripts_old/run_pciseq.py similarity index 100% rename from scripts/run_pciseq.py rename to scripts_old/run_pciseq.py diff --git a/scripts/segment_image.py b/scripts_old/segment_image.py similarity index 100% rename from scripts/segment_image.py rename to scripts_old/segment_image.py diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml new file mode 100644 index 00000000..4d767d8e --- /dev/null +++ b/src/api/comp_control_method.yaml @@ -0,0 +1,37 @@ +namespace: control_methods +info: + type: control_method + type_info: + label: Control Method + summary: Quality control methods for verifying the pipeline. + description: | + This folder contains control components for the task. + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. 
+arguments: + - name: --input_train + __merge__: file_train_h5ad.yaml + required: true + direction: input + - name: --input_test + __merge__: file_test_h5ad.yaml + required: true + direction: input + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml new file mode 100644 index 00000000..184bc548 --- /dev/null +++ b/src/api/comp_data_processor.yaml @@ -0,0 +1,31 @@ +namespace: "data_processors" +info: + type: data_processor + type_info: + label: Data processor + summary: A data processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. +arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_train" + __merge__: file_train_h5ad.yaml + direction: output + required: true + - name: "--output_test" + __merge__: file_test_h5ad.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py + diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml new file mode 100644 index 00000000..d7be9578 --- /dev/null +++ b/src/api/comp_method.yaml @@ -0,0 +1,28 @@ +namespace: "methods" +info: + type: method + type_info: + label: Method + summary: A method. + description: | + A method to predict the task effects. +arguments: + - name: --input_train + __merge__: file_train_h5ad.yaml + required: true + direction: input + - name: "--input_test" + __merge__: file_test_h5ad.yaml + direction: input + required: true + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml new file mode 100644 index 00000000..e3295da0 --- /dev/null +++ b/src/api/comp_metric.yaml @@ -0,0 +1,28 @@ +namespace: "metrics" +info: + type: metric + type_info: + label: Metric + summary: A task template metric. + description: | + A metric for evaluating method predictions. 
+arguments: + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--input_prediction" + __merge__: file_prediction.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml new file mode 100644 index 00000000..0927ea0a --- /dev/null +++ b/src/api/file_common_dataset.yaml @@ -0,0 +1,72 @@ +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +label: "Common Dataset" +summary: A subset of the common dataset. +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Cell type information + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml new file mode 100644 index 00000000..4a6dc328 --- /dev/null +++ b/src/api/file_prediction.yaml @@ -0,0 +1,26 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/prediction.h5ad" +label: "Predicted data" +summary: A predicted dataset as output by a method. +info: + format: + type: h5ad + obs: + - type: string + name: label_pred + description: Predicted labels for the test cells. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true \ No newline at end of file diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml new file mode 100644 index 00000000..f6022a83 --- /dev/null +++ b/src/api/file_score.yaml @@ -0,0 +1,31 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: resources/score.h5ad +label: Score +summary: "File indicating the score of a metric." +info: + format: + type: h5ad + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true \ No newline at end of file diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml new file mode 100644 index 00000000..81e168e9 --- /dev/null +++ b/src/api/file_solution.yaml @@ -0,0 +1,73 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/solution.h5ad" +label: "Solution" +summary: "The solution for the test data" +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/api/file_test_h5ad.yaml b/src/api/file_test_h5ad.yaml new file mode 100644 index 00000000..6ee21ac5 --- /dev/null +++ b/src/api/file_test_h5ad.yaml @@ -0,0 +1,45 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/test.h5ad" +label: "Test data" +summary: The subset of molecules used for the test dataset +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file diff --git a/src/api/file_train_h5ad.yaml b/src/api/file_train_h5ad.yaml new file mode 100644 index 00000000..7d2b51d5 --- /dev/null +++ b/src/api/file_train_h5ad.yaml @@ -0,0 +1,49 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/train.h5ad" +label: "Training data" +summary: "The training data in h5ad format" +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml new file mode 100644 index 00000000..741e3f25 --- /dev/null +++ b/src/control_methods/true_labels/config.vsh.yaml @@ -0,0 +1,59 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_control_method.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: true_labels + +# A relatively short label, used when rendering visualisations (required) +label: True Labels +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. 
+summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + A positive control, where the solution labels are copied 1 to 1 to the predicted data. + +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: counts + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + # setup: + # - type: python + # packages: scib==1.1.5 + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py new file mode 100644 index 00000000..0a04aaf4 --- /dev/null +++ b/src/control_methods/true_labels/script.py @@ -0,0 +1,45 @@ +import anndata as ad + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', + 'input_solution': 'resources_test/task_template/pancreas/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'true_labels' +} +## VIASH END + +print('Reading input files', flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) +input_solution = ad.read_h5ad(par['input_solution']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... + +print('Generate predictions', flush=True) +# ... generate predictions ... +obs_label_pred = input_solution.obs["label"] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], + 'method_id': meta['name'] + }, + obs={ + 'label_pred': obs_label_pred + } +) +output.obs_names = input_test.obs_names + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml new file mode 100644 index 00000000..a9977208 --- /dev/null +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_data_processor.yaml +name: process_dataset +arguments: + - name: "--method" + type: "string" + description: "The process method to assign train/test." + choices: ["batch", "random"] + default: "batch" + - name: "--obs_label" + type: "string" + description: "Which .obs slot to use as label." 
+ default: "cell_type" + - name: "--obs_batch" + type: "string" + description: "Which .obs slot to use as batch covariate." + default: "batch" + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 +resources: + - type: python_script + path: script.py + - path: /common/helper_functions/subset_h5ad_by_format.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py new file mode 100644 index 00000000..350d5564 --- /dev/null +++ b/src/data_processors/process_dataset/script.py @@ -0,0 +1,86 @@ +import sys +import random +import numpy as np +import anndata as ad +import openproblems as op + +## VIASH START +par = { + 'input': 'resources_test/common/pancreas/dataset.h5ad', + 'method': 'batch', + 'seed': None, + 'obs_batch': 'batch', + 'obs_label': 'cell_type', + 'output_train': 'train.h5ad', + 'output_test': 'test.h5ad', + 'output_solution': 'solution.h5ad' +} +meta = { + 'resources_dir': 'target/executable/data_processors/process_dataset', + 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' +} +## VIASH END + +# import helper functions +sys.path.append(meta['resources_dir']) +from subset_h5ad_by_format import subset_h5ad_by_format + +config = op.project.read_viash_config(meta["config"]) + +# set seed if need be +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print(">> Load data", flush=True) +adata = ad.read_h5ad(par["input"]) +print("input:", adata) + +print(f">> Process data using {par['method']} method") +if par["method"] == "batch": + batch_info = adata.obs[par["obs_batch"]] + batch_categories = batch_info.dtype.categories + test_batches = random.sample(list(batch_categories), 1) + is_test = [ x in test_batches for x in batch_info ] +elif par["method"] == "random": + train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) + is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] + +# subset the different adatas +print(">> Figuring which data needs to be copied to which output file", flush=True) +# use par arguments to look for label and batch value in different slots +slot_mapping = { + "obs": { + "label": par["obs_label"], + "batch": par["obs_batch"], + } +} + +print(">> Creating train data", flush=True) +output_train = subset_h5ad_by_format( + adata[[not x for x in is_test]], + config, + "output_train", + slot_mapping +) + +print(">> Creating test data", flush=True) +output_test = subset_h5ad_by_format( + adata[is_test], + config, + "output_test", + slot_mapping +) + +print(">> Creating solution data", flush=True) +output_solution = subset_h5ad_by_format( + adata[is_test], + config, + "output_solution", + slot_mapping +) + +print(">> Writing data", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) +output_solution.write_h5ad(par["output_solution"]) diff --git a/src/methods/logistic_regression/config.vsh.yaml b/src/methods/logistic_regression/config.vsh.yaml new file mode 100644 index 00000000..11109357 --- /dev/null +++ b/src/methods/logistic_regression/config.vsh.yaml @@ -0,0 +1,77 @@ +# The API specifies which type of component this is. 
+# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_method.yaml + + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: logistic_regression +# A relatively short label, used when rendering visualisations (required) +label: Logistic Regression +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. +summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + Logistic Regression estimates parameters of a logistic function for + multivariate classification tasks. Here, we use 100-dimensional whitened PCA + coordinates as independent variables, and the model minimises the cross + entropy loss over all cell type classes. +# Metadata for your component +# A reference key from the bibtex library at src/common/library.bib (required). +references: + bibtex: + - | + @book{hosmer2013applied, + title = {Applied logistic regression}, + author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, + year = {2013}, + publisher = {John Wiley \& Sons}, + volume = {398} + } +info: + + # Which normalisation method this component prefers to use (required). + preferred_normalization: log_cp10k + # URL to the documentation for this method (required). + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + # URL to the code repository for this method (required). + repository_url: https://github.com/scikit-learn/scikit-learn + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: scikit-learn + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py new file mode 100644 index 00000000..cc851f8e --- /dev/null +++ b/src/methods/logistic_regression/script.py @@ -0,0 +1,46 @@ +import anndata as ad +import sklearn.linear_model + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
+par = { + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'logistic_regression' +} +## VIASH END + +print('Reading input files', flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... +classifier = sklearn.linear_model.LogisticRegression() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) + +print('Generate predictions', flush=True) +# ... generate predictions ... +obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], + 'method_id': meta['name'] + }, + obs={ + 'label_pred': obs_label_pred + } +) +output.obs_names = input_test.obs_names + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/accuracy/config.vsh.yaml new file mode 100644 index 00000000..66fa8359 --- /dev/null +++ b/src/metrics/accuracy/config.vsh.yaml @@ -0,0 +1,70 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: accuracy + +# Metadata for your component +info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: accuracy + # A relatively short label, used when rendering visualisations (required) + label: Accuracy + # A one sentence summary of how this metric works (required). Used when + # rendering summary tables. + summary: "The percentage of correctly predicted labels." + # A multi-line description of how this component works (required). Used + # when rendering reference documentation. + description: | + The percentage of correctly predicted labels. + # A reference key from the bibtex library at src/common/library.bib (required). + references: + doi: 10.48550/arXiv.2008.05756 + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: 1 + # Whether a higher value represents a 'better' solution (required) + maximize: true + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: scikit-learn + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline.
+ - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py new file mode 100644 index 00000000..72dcb1e5 --- /dev/null +++ b/src/metrics/accuracy/script.py @@ -0,0 +1,47 @@ +import anndata as ad +import numpy as np +import sklearn.preprocessing + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_solution': 'resources_test/task_template/pancreas/solution.h5ad', + 'input_prediction': 'resources_test/task_template/pancreas/prediction.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'accuracy' +} +## VIASH END + +print('Reading input files', flush=True) +input_solution = ad.read_h5ad(par['input_solution']) +input_prediction = ad.read_h5ad(par['input_prediction']) + +assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +print("Encode labels", flush=True) +cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) +encoder = sklearn.preprocessing.LabelEncoder().fit(cats) +input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) +input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) + + +print('Compute metrics', flush=True) +# metric_ids and metric_values can have length > 1 +# but should be of equal length +uns_metric_ids = [ 'accuracy' ] +uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_prediction.uns['dataset_id'], + 'normalization_id': input_prediction.uns['normalization_id'], + 'method_id': input_prediction.uns['method_id'], + 'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 00000000..71c1b9be --- /dev/null +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,40 @@ +name: process_datasets +namespace: workflows + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/api/file_common_dataset.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_train" + __merge__: /src/api/file_train_h5ad.yaml + required: true + direction: output + - name: "--output_test" + __merge__: /src/api/file_test_h5ad.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/api/file_solution.yaml + required: true + direction: output + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf + +dependencies: + - name: common/check_dataset_schema + repository: openproblems-v2 + - name: common/extract_metadata + repository: openproblems-v2 + - name: data_processors/process_dataset + +runners: + - type: nextflow diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf new file mode 100644 index 00000000..eae19f7c --- /dev/null +++ b/src/workflows/process_datasets/main.nf @@ -0,0 +1,173 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStatesTemp(params, meta.config) + | 
meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test", "output_solution"]) + + emit: + output_ch +} + + +// temp fix for rename_keys typo + +def findStatesTemp(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesTempWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesTempWf +} \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 00000000..12a4602c --- /dev/null +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,74 @@ +name: run_benchmark +namespace: workflows + +argument_groups: + - name: Inputs + arguments: + - name: "--input_train" + __merge__: /src/api/file_train_h5ad.yaml + type: file + direction: input + required: true + - name: "--input_test" + __merge__: /src/api/file_test_h5ad.yaml + type: file + direction: input + required: true + - name: "--input_solution" + __merge__: /src/api/file_solution.yaml + type: file + direction: input + required: true + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. 
+ +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /_viash.yaml + +dependencies: + - name: common/check_dataset_schema + repository: openproblems-v2 + - name: common/extract_metadata + repository: openproblems-v2 + - name: control_methods/true_labels + - name: methods/logistic_regression + - name: metrics/accuracy + +runners: + - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf new file mode 100644 index 00000000..68e5ecd3 --- /dev/null +++ b/src/workflows/run_benchmark/main.nf @@ -0,0 +1,311 @@ +workflow auto { + findStatesTemp(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + true_labels, + logistic_regression + ] + + // construct list of metrics + metrics = [ + accuracy + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input_train: state.input_train, + input_test: state.input_test + ] + if (comp.config.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_solution: "input_solution", + input_prediction: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.name, + metric_output: output.output + ] + } + ) + + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? 
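+ // (Overview of the section below: dataset_meta_ch keeps a single metadata entry per dataset, filtering to the "log_cp10k" normalization, and writes it to a dataset_uns.yaml temp file; output_ch extracts the per-run scores and bundles them with the method configs, metric configs and the task info read from _viash.yaml; the two channels are then mixed and joined into one output state.)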
+ + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def viash_file = meta.resources_dir.resolve("_viash.yaml") + def viash_file_content = toYamlBlob(readYaml(viash_file).info) + def task_info_file = tempFile("task_info.yaml") + task_info_file.write(viash_file_content) + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} + +// temp fix for rename_keys typo + +def findStatesTemp(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. 
This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesTempWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesTempWf +} \ No newline at end of file
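As a quick illustration of the rename_keys handling in the findStatesTemp helper above, the following standalone Groovy sketch (not part of the patch; the ids, keys and file names are invented for the example) shows how each "newKey:oldKey" entry is applied to a detected state and how states that lack one of the requested old keys are dropped:

// Minimal, self-contained Groovy sketch of the rename_keys behaviour above.
// The ids, keys and file names are hypothetical; runnable as a plain Groovy script.
def renameKeys = ["input_train:output_train", "input_test:output_test"]
def states = [
  ["ds1", [output_train: "ds1/train.h5ad", output_test: "ds1/test.h5ad"]],
  ["ds2", [output_train: "ds2/train.h5ad"]]  // missing output_test, so it is dropped
]

// "newKey:oldKey" strings become a map from new key to old key
def renameMap = renameKeys.collectEntries { it.split(":") }

// rename keys per state; states lacking any of the old keys are filtered out
def renamed = states.collectMany { id, state ->
  def newState = [:]
  for (key in renameMap.keySet()) {
    def origKey = renameMap[key]
    if (!state.containsKey(origKey)) {
      return []
    }
    newState[key] = state[origKey]
  }
  [[id, newState]]
}

assert renamed == [["ds1", [input_train: "ds1/train.h5ad", input_test: "ds1/test.h5ad"]]]

Both main.nf files carry an identical copy of this helper as a temporary workaround, so the two copies should be kept in sync (or removed together) once the upstream fix lands.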