From d11e85087744738f34465c3bcf91910e521f5539 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 26 Aug 2024 15:50:39 +0200 Subject: [PATCH] add viash template --- .github/ISSUE_TEMPLATE/bug_report.md | 24 ++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/ISSUE_TEMPLATE/feature_request.md | 20 ++ .github/PULL_REQUEST_TEMPLATE.md | 17 + .github/workflows/test.yaml | 11 + .gitignore | 4 + .gitmodules | 3 + CHANGELOG.md | 26 ++ LICENSE | 21 ++ _viash.yaml | 87 +++++ common | 1 + main.nf | 3 + nextflow.config | 1 + scripts/.gitignore | 3 + {scripts => scripts_old}/aggregate_counts.py | 0 .../aggregate_group_metrics.py | 0 {scripts => scripts_old}/aggregate_metrics.py | 0 .../aggregate_tile_assignments.py | 0 .../annotate_celltypes.py | 0 .../annotate_celltypes_FRmatch.r | 0 .../annotate_celltypes_consensus_GMCS.py | 0 .../annotate_celltypes_consensus_NWCS.py | 0 .../annotate_celltypes_mapmycells.py | 0 .../annotate_celltypes_mfishtools.r | 0 .../annotate_celltypes_scrattchmapping.r | 0 .../annotate_celltypes_tangram.py | 0 {scripts => scripts_old}/annotate_counts.py | 0 {scripts => scripts_old}/basic_assignment.py | 0 .../calc_group_metrics.py | 0 {scripts => scripts_old}/calc_metrics.py | 0 .../calc_quality_metrics.py | 0 .../copy_celltypes_pciseqct.py | 0 {scripts => scripts_old}/gen_counts.py | 0 {scripts => scripts_old}/generate_tile.py | 0 {scripts => scripts_old}/normalize_sc.py | 0 .../retrieve_tiles_info.py | 0 {scripts => scripts_old}/run_baysor.py | 0 {scripts => scripts_old}/run_clustermap.py | 0 {scripts => scripts_old}/run_mesmer.py | 0 {scripts => scripts_old}/run_pciseq.py | 0 {scripts => scripts_old}/segment_image.py | 0 src/api/comp_control_method.yaml | 37 +++ src/api/comp_data_processor.yaml | 31 ++ src/api/comp_method.yaml | 28 ++ src/api/comp_metric.yaml | 28 ++ src/api/file_common_dataset.yaml | 72 ++++ src/api/file_prediction.yaml | 26 ++ src/api/file_score.yaml | 31 ++ src/api/file_solution.yaml | 73 ++++ src/api/file_test_h5ad.yaml | 45 +++ src/api/file_train_h5ad.yaml | 49 +++ .../true_labels/config.vsh.yaml | 59 ++++ src/control_methods/true_labels/script.py | 45 +++ .../process_dataset/config.vsh.yaml | 34 ++ src/data_processors/process_dataset/script.py | 86 +++++ .../logistic_regression/config.vsh.yaml | 77 +++++ src/methods/logistic_regression/script.py | 46 +++ src/metrics/accuracy/config.vsh.yaml | 70 ++++ src/metrics/accuracy/script.py | 47 +++ .../process_datasets/config.vsh.yaml | 40 +++ src/workflows/process_datasets/main.nf | 173 ++++++++++ src/workflows/run_benchmark/config.vsh.yaml | 74 +++++ src/workflows/run_benchmark/main.nf | 311 ++++++++++++++++++ 63 files changed, 1704 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/test.yaml create mode 100644 .gitmodules create mode 100644 CHANGELOG.md create mode 100644 LICENSE create mode 100644 _viash.yaml create mode 160000 common create mode 100644 main.nf create mode 100644 nextflow.config create mode 100644 scripts/.gitignore rename {scripts => scripts_old}/aggregate_counts.py (100%) rename {scripts => scripts_old}/aggregate_group_metrics.py (100%) rename {scripts => scripts_old}/aggregate_metrics.py (100%) rename {scripts => scripts_old}/aggregate_tile_assignments.py (100%) rename {scripts => scripts_old}/annotate_celltypes.py (100%) rename {scripts => 
scripts_old}/annotate_celltypes_FRmatch.r (100%) rename {scripts => scripts_old}/annotate_celltypes_consensus_GMCS.py (100%) rename {scripts => scripts_old}/annotate_celltypes_consensus_NWCS.py (100%) rename {scripts => scripts_old}/annotate_celltypes_mapmycells.py (100%) rename {scripts => scripts_old}/annotate_celltypes_mfishtools.r (100%) rename {scripts => scripts_old}/annotate_celltypes_scrattchmapping.r (100%) rename {scripts => scripts_old}/annotate_celltypes_tangram.py (100%) rename {scripts => scripts_old}/annotate_counts.py (100%) rename {scripts => scripts_old}/basic_assignment.py (100%) rename {scripts => scripts_old}/calc_group_metrics.py (100%) rename {scripts => scripts_old}/calc_metrics.py (100%) rename {scripts => scripts_old}/calc_quality_metrics.py (100%) rename {scripts => scripts_old}/copy_celltypes_pciseqct.py (100%) rename {scripts => scripts_old}/gen_counts.py (100%) rename {scripts => scripts_old}/generate_tile.py (100%) rename {scripts => scripts_old}/normalize_sc.py (100%) rename {scripts => scripts_old}/retrieve_tiles_info.py (100%) rename {scripts => scripts_old}/run_baysor.py (100%) rename {scripts => scripts_old}/run_clustermap.py (100%) rename {scripts => scripts_old}/run_mesmer.py (100%) rename {scripts => scripts_old}/run_pciseq.py (100%) rename {scripts => scripts_old}/segment_image.py (100%) create mode 100644 src/api/comp_control_method.yaml create mode 100644 src/api/comp_data_processor.yaml create mode 100644 src/api/comp_method.yaml create mode 100644 src/api/comp_metric.yaml create mode 100644 src/api/file_common_dataset.yaml create mode 100644 src/api/file_prediction.yaml create mode 100644 src/api/file_score.yaml create mode 100644 src/api/file_solution.yaml create mode 100644 src/api/file_test_h5ad.yaml create mode 100644 src/api/file_train_h5ad.yaml create mode 100644 src/control_methods/true_labels/config.vsh.yaml create mode 100644 src/control_methods/true_labels/script.py create mode 100644 src/data_processors/process_dataset/config.vsh.yaml create mode 100644 src/data_processors/process_dataset/script.py create mode 100644 src/methods/logistic_regression/config.vsh.yaml create mode 100644 src/methods/logistic_regression/script.py create mode 100644 src/metrics/accuracy/config.vsh.yaml create mode 100644 src/metrics/accuracy/script.py create mode 100644 src/workflows/process_datasets/config.vsh.yaml create mode 100644 src/workflows/process_datasets/main.nf create mode 100644 src/workflows/run_benchmark/config.vsh.yaml create mode 100644 src/workflows/run_benchmark/main.nf diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..9a8a64b4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: [bug] +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..a49eab2f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..c17d3c0d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: [enhancement] +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..37171375 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,17 @@ +## Describe your changes + + + +## Checklist before requesting a review +- [ ] I have performed a self-review of my code + +- Check the correct box. Does this PR contain: + - [ ] Breaking changes + - [ ] New functionality + - [ ] Major changes + - [ ] Minor changes + - [ ] Bug fixes + +- [ ] Proposed changes are described in the CHANGELOG.md + +- [ ] CI Tests succeed and look good! \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 00000000..7194c6ea --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,11 @@ +name: Test + +on: + push: + branches: + - main + pull_request: + +jobs: + build: + uses: viash-io/viash-actions/.github/workflows/test.yaml@v6 diff --git a/.gitignore b/.gitignore index dac9fd8a..d86eb196 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,7 @@ temp/ dependency.png envs/src/ singularity_container/ + +target +.nextflow +.resources_test \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..d35f6d23 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "common"] + path = common + url = git@github.com:openproblems-bio/common_resources.git diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..38397448 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,26 @@ +# task_template x.y.z + +## BREAKING CHANGES + + + +## NEW FUNCTIONALITY + +* Added `control_methods/true_labels` component (PR #5). + +* Added `methods/logistic_regression` component (PR #5). + +* Added `metrics/accuracy` component (PR #5). + +## MAJOR CHANGES + +* Updated `api` files (PR #5). + +* Updated configs, components and CI to the latest Viash version (PR #8). + +## MINOR CHANGES + +* Updated `README.md` (PR #5). 
+ +## BUGFIXES + diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..3a85904e --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Open Problems in Single-Cell Analysis + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/_viash.yaml b/_viash.yaml new file mode 100644 index 00000000..c4d0ca05 --- /dev/null +++ b/_viash.yaml @@ -0,0 +1,87 @@ +viash_version: 0.9.0-RC7 + +# Step 1: Change the name of the task. +# example: task_name_of_this_task +name: task_preprocessing_imagingbased_st +organization: openproblems-bio +version: dev + +license: MIT +# Step 2: Add keywords to describe the task. +keywords: [single-cell, openproblems, benchmark] +# Step 3: Update the `task_template` to the name of the task from step 1. +links: + issue_tracker: https://github.com/openproblems-bio/task_preprocessing_imagingbased_st/issues + repository: https://github.com/openproblems-bio/task_preprocessing_imagingbased_st + docker_registry: ghcr.io + + +# Step 4: Update the label, summary and description. +# A unique, human-readable, short label. Used for creating summary tables and visualisations. +label: Template +summary: A one sentence summary of purpose and methodology. Used for creating overview tables. +description: | + Provide a clear and concise description of your task, detailing the specific problem it aims + to solve. Outline the input data types, the expected output, and any assumptions or constraints. + Be sure to explain any terminology or concepts that are essential for understanding the task. + + Explain the motivation behind your proposed task. Describe the biological or computational + problem you aim to address and why it's important. Discuss the current state of research in + this area and any gaps or challenges that your task could help address. This section + should convince readers of the significance and relevance of your task. + +# A list of references to relevant literature. Each reference should be a DOI or a bibtex entry +references: + doi: + - 10.21203/rs.3.rs-4181617/v1 + # bibtex: + # - | + # @article{doe_2021_template, + # doi = {10.21203/rs.3.rs-4181617/v1}, + # url = {https://doi.org/10.21203/rs.3.rs-4181617/v1}, + # author = {Doe, John}, + # title = {A template for creating new tasks}, + # publisher = {Research Square}, + # year = {2021}, + # } + +info: + image: The name of the image file to use for the component on the website. + # Step 5: Replace `task_template` with the name of the task.
+ test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/task_template/ + dest: resources_test/task_template + - type: s3 + path: s3://openproblems-data/resources_test/common/ + dest: resources_test/common + +# Step 6: Update the authors of the task. +authors: + # Full name of the author, usually in the name of FirstName MiddleName LastName. + - name: John Doe + # Role of the author. Possible values: + # + # * `"author"`: Authors who have made substantial contributions to the component. + # * `"maintainer"`: The maintainer of the component. + # * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). + roles: [ "author", "maintainer" ] + # Additional information on the author + info: + github: johndoe + orcid: 0000-0000-0000-0000 + email: john@doe.me + twitter: johndoe + linkedin: johndoe + +# Step 7: Remove all of the comments of the steps you completed +# Step 8: High five yourself! + +config_mods: | + .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } + +repositories: + - name: openproblems-v2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build \ No newline at end of file diff --git a/common b/common new file mode 160000 index 00000000..67ef9612 --- /dev/null +++ b/common @@ -0,0 +1 @@ +Subproject commit 67ef9612fce1bbf22e07971e55a9858e8dd2dfa5 diff --git a/main.nf b/main.nf new file mode 100644 index 00000000..62f01409 --- /dev/null +++ b/main.nf @@ -0,0 +1,3 @@ +workflow { + print("This is a dummy placeholder for pipeline execution. 
Please use the corresponding nf files for running pipelines.") +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 00000000..8fc6c4e3 --- /dev/null +++ b/nextflow.config @@ -0,0 +1 @@ +process.container = 'nextflow/bash:latest' \ No newline at end of file diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 00000000..2f7ffd34 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1,3 @@ +add_a_method.sh +add_a_control_method.sh +add_a_metric.sh \ No newline at end of file diff --git a/scripts/aggregate_counts.py b/scripts_old/aggregate_counts.py similarity index 100% rename from scripts/aggregate_counts.py rename to scripts_old/aggregate_counts.py diff --git a/scripts/aggregate_group_metrics.py b/scripts_old/aggregate_group_metrics.py similarity index 100% rename from scripts/aggregate_group_metrics.py rename to scripts_old/aggregate_group_metrics.py diff --git a/scripts/aggregate_metrics.py b/scripts_old/aggregate_metrics.py similarity index 100% rename from scripts/aggregate_metrics.py rename to scripts_old/aggregate_metrics.py diff --git a/scripts/aggregate_tile_assignments.py b/scripts_old/aggregate_tile_assignments.py similarity index 100% rename from scripts/aggregate_tile_assignments.py rename to scripts_old/aggregate_tile_assignments.py diff --git a/scripts/annotate_celltypes.py b/scripts_old/annotate_celltypes.py similarity index 100% rename from scripts/annotate_celltypes.py rename to scripts_old/annotate_celltypes.py diff --git a/scripts/annotate_celltypes_FRmatch.r b/scripts_old/annotate_celltypes_FRmatch.r similarity index 100% rename from scripts/annotate_celltypes_FRmatch.r rename to scripts_old/annotate_celltypes_FRmatch.r diff --git a/scripts/annotate_celltypes_consensus_GMCS.py b/scripts_old/annotate_celltypes_consensus_GMCS.py similarity index 100% rename from scripts/annotate_celltypes_consensus_GMCS.py rename to scripts_old/annotate_celltypes_consensus_GMCS.py diff --git a/scripts/annotate_celltypes_consensus_NWCS.py b/scripts_old/annotate_celltypes_consensus_NWCS.py similarity index 100% rename from scripts/annotate_celltypes_consensus_NWCS.py rename to scripts_old/annotate_celltypes_consensus_NWCS.py diff --git a/scripts/annotate_celltypes_mapmycells.py b/scripts_old/annotate_celltypes_mapmycells.py similarity index 100% rename from scripts/annotate_celltypes_mapmycells.py rename to scripts_old/annotate_celltypes_mapmycells.py diff --git a/scripts/annotate_celltypes_mfishtools.r b/scripts_old/annotate_celltypes_mfishtools.r similarity index 100% rename from scripts/annotate_celltypes_mfishtools.r rename to scripts_old/annotate_celltypes_mfishtools.r diff --git a/scripts/annotate_celltypes_scrattchmapping.r b/scripts_old/annotate_celltypes_scrattchmapping.r similarity index 100% rename from scripts/annotate_celltypes_scrattchmapping.r rename to scripts_old/annotate_celltypes_scrattchmapping.r diff --git a/scripts/annotate_celltypes_tangram.py b/scripts_old/annotate_celltypes_tangram.py similarity index 100% rename from scripts/annotate_celltypes_tangram.py rename to scripts_old/annotate_celltypes_tangram.py diff --git a/scripts/annotate_counts.py b/scripts_old/annotate_counts.py similarity index 100% rename from scripts/annotate_counts.py rename to scripts_old/annotate_counts.py diff --git a/scripts/basic_assignment.py b/scripts_old/basic_assignment.py similarity index 100% rename from scripts/basic_assignment.py rename to scripts_old/basic_assignment.py diff --git a/scripts/calc_group_metrics.py 
b/scripts_old/calc_group_metrics.py similarity index 100% rename from scripts/calc_group_metrics.py rename to scripts_old/calc_group_metrics.py diff --git a/scripts/calc_metrics.py b/scripts_old/calc_metrics.py similarity index 100% rename from scripts/calc_metrics.py rename to scripts_old/calc_metrics.py diff --git a/scripts/calc_quality_metrics.py b/scripts_old/calc_quality_metrics.py similarity index 100% rename from scripts/calc_quality_metrics.py rename to scripts_old/calc_quality_metrics.py diff --git a/scripts/copy_celltypes_pciseqct.py b/scripts_old/copy_celltypes_pciseqct.py similarity index 100% rename from scripts/copy_celltypes_pciseqct.py rename to scripts_old/copy_celltypes_pciseqct.py diff --git a/scripts/gen_counts.py b/scripts_old/gen_counts.py similarity index 100% rename from scripts/gen_counts.py rename to scripts_old/gen_counts.py diff --git a/scripts/generate_tile.py b/scripts_old/generate_tile.py similarity index 100% rename from scripts/generate_tile.py rename to scripts_old/generate_tile.py diff --git a/scripts/normalize_sc.py b/scripts_old/normalize_sc.py similarity index 100% rename from scripts/normalize_sc.py rename to scripts_old/normalize_sc.py diff --git a/scripts/retrieve_tiles_info.py b/scripts_old/retrieve_tiles_info.py similarity index 100% rename from scripts/retrieve_tiles_info.py rename to scripts_old/retrieve_tiles_info.py diff --git a/scripts/run_baysor.py b/scripts_old/run_baysor.py similarity index 100% rename from scripts/run_baysor.py rename to scripts_old/run_baysor.py diff --git a/scripts/run_clustermap.py b/scripts_old/run_clustermap.py similarity index 100% rename from scripts/run_clustermap.py rename to scripts_old/run_clustermap.py diff --git a/scripts/run_mesmer.py b/scripts_old/run_mesmer.py similarity index 100% rename from scripts/run_mesmer.py rename to scripts_old/run_mesmer.py diff --git a/scripts/run_pciseq.py b/scripts_old/run_pciseq.py similarity index 100% rename from scripts/run_pciseq.py rename to scripts_old/run_pciseq.py diff --git a/scripts/segment_image.py b/scripts_old/segment_image.py similarity index 100% rename from scripts/segment_image.py rename to scripts_old/segment_image.py diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml new file mode 100644 index 00000000..4d767d8e --- /dev/null +++ b/src/api/comp_control_method.yaml @@ -0,0 +1,37 @@ +namespace: control_methods +info: + type: control_method + type_info: + label: Control Method + summary: Quality control methods for verifying the pipeline. + description: | + This folder contains control components for the task. + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. 
+arguments: + - name: --input_train + __merge__: file_train_h5ad.yaml + required: true + direction: input + - name: --input_test + __merge__: file_test_h5ad.yaml + required: true + direction: input + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml new file mode 100644 index 00000000..184bc548 --- /dev/null +++ b/src/api/comp_data_processor.yaml @@ -0,0 +1,31 @@ +namespace: "data_processors" +info: + type: data_processor + type_info: + label: Data processor + summary: A data processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. +arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_train" + __merge__: file_train_h5ad.yaml + direction: output + required: true + - name: "--output_test" + __merge__: file_test_h5ad.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py + diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml new file mode 100644 index 00000000..d7be9578 --- /dev/null +++ b/src/api/comp_method.yaml @@ -0,0 +1,28 @@ +namespace: "methods" +info: + type: method + type_info: + label: Method + summary: A method. + description: | + A method to predict the task effects. +arguments: + - name: --input_train + __merge__: file_train_h5ad.yaml + required: true + direction: input + - name: "--input_test" + __merge__: file_test_h5ad.yaml + direction: input + required: true + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml new file mode 100644 index 00000000..e3295da0 --- /dev/null +++ b/src/api/comp_metric.yaml @@ -0,0 +1,28 @@ +namespace: "metrics" +info: + type: metric + type_info: + label: Metric + summary: A task template metric. + description: | + A metric for evaluating method predictions. 
+arguments: + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--input_prediction" + __merge__: file_prediction.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - type: python_script + path: /common/component_tests/check_config.py + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml new file mode 100644 index 00000000..0927ea0a --- /dev/null +++ b/src/api/file_common_dataset.yaml @@ -0,0 +1,72 @@ +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +label: "Common Dataset" +summary: A subset of the common dataset. +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Cell type information + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml new file mode 100644 index 00000000..4a6dc328 --- /dev/null +++ b/src/api/file_prediction.yaml @@ -0,0 +1,26 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/prediction.h5ad" +label: "Predicted data" +summary: A predicted dataset as output by a method. +info: + format: + type: h5ad + obs: + - type: string + name: label_pred + description: Predicted labels for the test cells. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true \ No newline at end of file diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml new file mode 100644 index 00000000..f6022a83 --- /dev/null +++ b/src/api/file_score.yaml @@ -0,0 +1,31 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: resources/score.h5ad +label: Score +summary: "File indicating the score of a metric." +info: + format: + type: h5ad + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true \ No newline at end of file diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml new file mode 100644 index 00000000..81e168e9 --- /dev/null +++ b/src/api/file_solution.yaml @@ -0,0 +1,73 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/solution.h5ad" +label: "Solution" +summary: "The solution for the test data" +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/api/file_test_h5ad.yaml b/src/api/file_test_h5ad.yaml new file mode 100644 index 00000000..6ee21ac5 --- /dev/null +++ b/src/api/file_test_h5ad.yaml @@ -0,0 +1,45 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/test.h5ad" +label: "Test data" +summary: The subset of molecules used for the test dataset +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file diff --git a/src/api/file_train_h5ad.yaml b/src/api/file_train_h5ad.yaml new file mode 100644 index 00000000..7d2b51d5 --- /dev/null +++ b/src/api/file_train_h5ad.yaml @@ -0,0 +1,49 @@ +#TODO: Change to the required and/or optional fields of the anndata +type: file +example: "resources_test/task_template/pancreas/train.h5ad" +label: "Training data" +summary: "The training data in h5ad format" +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml new file mode 100644 index 00000000..741e3f25 --- /dev/null +++ b/src/control_methods/true_labels/config.vsh.yaml @@ -0,0 +1,59 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_control_method.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: true_labels + +# A relatively short label, used when rendering visualisations (required) +label: True Labels +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. 
+summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + A positive control, where the solution labels are copied 1 to 1 to the predicted data. + +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: counts + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + # setup: + # - type: python + # packages: scib==1.1.5 + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py new file mode 100644 index 00000000..0a04aaf4 --- /dev/null +++ b/src/control_methods/true_labels/script.py @@ -0,0 +1,45 @@ +import anndata as ad + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', + 'input_solution': 'resources_test/task_template/pancreas/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'true_labels' +} +## VIASH END + +print('Reading input files', flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) +input_solution = ad.read_h5ad(par['input_solution']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... + +print('Generate predictions', flush=True) +# ... generate predictions ... +obs_label_pred = input_solution.obs["label"] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], + 'method_id': meta['name'] + }, + obs={ + 'label_pred': obs_label_pred + } +) +output.obs_names = input_test.obs_names + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml new file mode 100644 index 00000000..a9977208 --- /dev/null +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_data_processor.yaml +name: process_dataset +arguments: + - name: "--method" + type: "string" + description: "The process method to assign train/test." + choices: ["batch", "random"] + default: "batch" + - name: "--obs_label" + type: "string" + description: "Which .obs slot to use as label." 
+ default: "cell_type" + - name: "--obs_batch" + type: "string" + description: "Which .obs slot to use as batch covariate." + default: "batch" + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 +resources: + - type: python_script + path: script.py + - path: /common/helper_functions/subset_h5ad_by_format.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py new file mode 100644 index 00000000..350d5564 --- /dev/null +++ b/src/data_processors/process_dataset/script.py @@ -0,0 +1,86 @@ +import sys +import random +import numpy as np +import anndata as ad +import openproblems as op + +## VIASH START +par = { + 'input': 'resources_test/common/pancreas/dataset.h5ad', + 'method': 'batch', + 'seed': None, + 'obs_batch': 'batch', + 'obs_label': 'cell_type', + 'output_train': 'train.h5ad', + 'output_test': 'test.h5ad', + 'output_solution': 'solution.h5ad' +} +meta = { + 'resources_dir': 'target/executable/data_processors/process_dataset', + 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' +} +## VIASH END + +# import helper functions +sys.path.append(meta['resources_dir']) +from subset_h5ad_by_format import subset_h5ad_by_format + +config = op.project.read_viash_config(meta["config"]) + +# set seed if need be +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print(">> Load data", flush=True) +adata = ad.read_h5ad(par["input"]) +print("input:", adata) + +print(f">> Process data using {par['method']} method") +if par["method"] == "batch": + batch_info = adata.obs[par["obs_batch"]] + batch_categories = batch_info.dtype.categories + test_batches = random.sample(list(batch_categories), 1) + is_test = [ x in test_batches for x in batch_info ] +elif par["method"] == "random": + train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) + is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] + +# subset the different adatas +print(">> Figuring which data needs to be copied to which output file", flush=True) +# use par arguments to look for label and batch value in different slots +slot_mapping = { + "obs": { + "label": par["obs_label"], + "batch": par["obs_batch"], + } +} + +print(">> Creating train data", flush=True) +output_train = subset_h5ad_by_format( + adata[[not x for x in is_test]], + config, + "output_train", + slot_mapping +) + +print(">> Creating test data", flush=True) +output_test = subset_h5ad_by_format( + adata[is_test], + config, + "output_test", + slot_mapping +) + +print(">> Creating solution data", flush=True) +output_solution = subset_h5ad_by_format( + adata[is_test], + config, + "output_solution", + slot_mapping +) + +print(">> Writing data", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) +output_solution.write_h5ad(par["output_solution"]) diff --git a/src/methods/logistic_regression/config.vsh.yaml b/src/methods/logistic_regression/config.vsh.yaml new file mode 100644 index 00000000..11109357 --- /dev/null +++ b/src/methods/logistic_regression/config.vsh.yaml @@ -0,0 +1,77 @@ +# The API specifies which type of component this is. 
+# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_method.yaml + + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: logistic_regression +# A relatively short label, used when rendering visualisations (required) +label: Logistic Regression +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. +summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + Logistic Regression estimates parameters of a logistic function for + multivariate classification tasks. Here, we use 100-dimensional whitened PCA + coordinates as independent variables, and the model minimises the cross + entropy loss over all cell type classes. +# Metadata for your component +# A reference key from the bibtex library at src/common/library.bib (required). +references: + bibtex: + - | + @book{hosmer2013applied, + title = {Applied logistic regression}, + author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, + year = {2013}, + publisher = {John Wiley \& Sons}, + volume = {398} + } +info: + + # Which normalisation method this component prefers to use (required). + preferred_normalization: log_cp10k + # URL to the documentation for this method (required). + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + # URL to the code repository for this method (required). + repository_url: https://github.com/scikit-learn/scikit-learn + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: scikit-learn + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py new file mode 100644 index 00000000..cc851f8e --- /dev/null +++ b/src/methods/logistic_regression/script.py @@ -0,0 +1,46 @@ +import anndata as ad +import sklearn.linear_model + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
+par = { + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'logistic_regression' +} +## VIASH END + +print('Reading input files', flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... +classifier = sklearn.linear_model.LogisticRegression() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) + +print('Generate predictions', flush=True) +# ... generate predictions ... +obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], + 'method_id': meta['name'] + }, + obs={ + 'label_pred': obs_label_pred + } +) +output.obs_names = input_test.obs_names + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/accuracy/config.vsh.yaml new file mode 100644 index 00000000..66fa8359 --- /dev/null +++ b/src/metrics/accuracy/config.vsh.yaml @@ -0,0 +1,70 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: accuracy + +# Metadata for your component +info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: accuracy + # A relatively short label, used when rendering visualisations (required) + label: Accuracy + # A one sentence summary of how this metric works (required). Used when + # rendering summary tables. + summary: "The percentage of correctly predicted labels." + # A multi-line description of how this component works (required). Used + # when rendering reference documentation. + description: | + The percentage of correctly predicted labels. + # A reference key from the bibtex library at src/common/library.bib (required). + references: + doi: 10.48550/arXiv.2008.05756 + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: 1 + # Whether a higher value represents a 'better' solution (required) + maximize: true + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: scikit-learn + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline.
+ - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py new file mode 100644 index 00000000..72dcb1e5 --- /dev/null +++ b/src/metrics/accuracy/script.py @@ -0,0 +1,47 @@ +import anndata as ad +import numpy as np +import sklearn.preprocessing + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_solution': 'resources_test/task_template/pancreas/solution.h5ad', + 'input_prediction': 'resources_test/task_template/pancreas/prediction.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'accuracy' +} +## VIASH END + +print('Reading input files', flush=True) +input_solution = ad.read_h5ad(par['input_solution']) +input_prediction = ad.read_h5ad(par['input_prediction']) + +assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +print("Encode labels", flush=True) +cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) +encoder = sklearn.preprocessing.LabelEncoder().fit(cats) +input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) +input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) + + +print('Compute metrics', flush=True) +# metric_ids and metric_values can have length > 1 +# but should be of equal length +uns_metric_ids = [ 'accuracy' ] +uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_prediction.uns['dataset_id'], + 'normalization_id': input_prediction.uns['normalization_id'], + 'method_id': input_prediction.uns['method_id'], + 'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 00000000..71c1b9be --- /dev/null +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,40 @@ +name: process_datasets +namespace: workflows + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/api/file_common_dataset.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_train" + __merge__: /src/api/file_train_h5ad.yaml + required: true + direction: output + - name: "--output_test" + __merge__: /src/api/file_test_h5ad.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/api/file_solution.yaml + required: true + direction: output + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf + +dependencies: + - name: common/check_dataset_schema + repository: openproblems-v2 + - name: common/extract_metadata + repository: openproblems-v2 + - name: data_processors/process_dataset + +runners: + - type: nextflow diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf new file mode 100644 index 00000000..eae19f7c --- /dev/null +++ b/src/workflows/process_datasets/main.nf @@ -0,0 +1,173 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStatesTemp(params, meta.config) + | 
meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test", "output_solution"]) + + emit: + output_ch +} + + +// temp fix for rename_keys typo + +def findStatesTemp(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesTempWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesTempWf +} \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 00000000..12a4602c --- /dev/null +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,74 @@ +name: run_benchmark +namespace: workflows + +argument_groups: + - name: Inputs + arguments: + - name: "--input_train" + __merge__: /src/api/file_train_h5ad.yaml + type: file + direction: input + required: true + - name: "--input_test" + __merge__: /src/api/file_test_h5ad.yaml + type: file + direction: input + required: true + - name: "--input_solution" + __merge__: /src/api/file_solution.yaml + type: file + direction: input + required: true + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. 
+ +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /_viash.yaml + +dependencies: + - name: common/check_dataset_schema + repository: openproblems-v2 + - name: common/extract_metadata + repository: openproblems-v2 + - name: control_methods/true_labels + - name: methods/logistic_regression + - name: metrics/accuracy + +runners: + - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf new file mode 100644 index 00000000..68e5ecd3 --- /dev/null +++ b/src/workflows/run_benchmark/main.nf @@ -0,0 +1,311 @@ +workflow auto { + findStatesTemp(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + true_labels, + logistic_regression + ] + + // construct list of metrics + metrics = [ + accuracy + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input_train: state.input_train, + input_test: state.input_test + ] + if (comp.config.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_solution: "input_solution", + input_prediction: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.name, + metric_output: output.output + ] + } + ) + + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? 
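+ // (Overview of the section below: dataset_meta_ch keeps a single metadata entry per dataset, filtering to the "log_cp10k" normalization, and writes it to a dataset_uns.yaml temp file; output_ch extracts the per-run scores and bundles them with the method configs, metric configs and the task info read from _viash.yaml; the two channels are then mixed and joined into one output state.)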
+ + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def viash_file = meta.resources_dir.resolve("_viash.yaml") + def viash_file_content = toYamlBlob(readYaml(viash_file).info) + def task_info_file = tempFile("task_info.yaml") + task_info_file.write(viash_file_content) + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} + +// temp fix for rename_keys typo + +def findStatesTemp(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. 
This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesTempWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesTempWf +} \ No newline at end of file
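As a quick illustration of the rename_keys handling in the findStatesTemp helper above, the following standalone Groovy sketch (not part of the patch; the ids, keys and file names are invented for the example) shows how each "newKey:oldKey" entry is applied to a detected state and how states that lack one of the requested old keys are dropped:

// Minimal, self-contained Groovy sketch of the rename_keys behaviour above.
// The ids, keys and file names are hypothetical; runnable as a plain Groovy script.
def renameKeys = ["input_train:output_train", "input_test:output_test"]
def states = [
  ["ds1", [output_train: "ds1/train.h5ad", output_test: "ds1/test.h5ad"]],
  ["ds2", [output_train: "ds2/train.h5ad"]]  // missing output_test, so it is dropped
]

// "newKey:oldKey" strings become a map from new key to old key
def renameMap = renameKeys.collectEntries { it.split(":") }

// rename keys per state; states lacking any of the old keys are filtered out
def renamed = states.collectMany { id, state ->
  def newState = [:]
  for (key in renameMap.keySet()) {
    def origKey = renameMap[key]
    if (!state.containsKey(origKey)) {
      return []
    }
    newState[key] = state[origKey]
  }
  [[id, newState]]
}

assert renamed == [["ds1", [input_train: "ds1/train.h5ad", input_test: "ds1/test.h5ad"]]]

Both main.nf files carry an identical copy of this helper as a temporary workaround, so the two copies should be kept in sync (or removed together) once the upstream fix lands.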