Merge pull request #11 from openproblems-bio/feature/no-ref/update-to-task_template

update submodule
KaiWaldrant authored Aug 9, 2024
2 parents f2bb633 + 49d0370 commit 77e65bb
Showing 5 changed files with 58 additions and 61 deletions.
60 changes: 55 additions & 5 deletions _viash.yaml
@@ -1,9 +1,9 @@
viash_version: 0.9.0-RC6

name: task_denoising
version: dev

organization: openproblems-bio
description: |
  An OpenProblems benchmark task.
  Removing noise in sparse single-cell RNA-sequencing count data.
license: MIT
keywords: [single-cell, openproblems, benchmark, denoising]
links:
@@ -12,16 +12,66 @@ links:
docker_registry: ghcr.io

info:
  label: Denoising
  summary: "Removing noise in sparse single-cell RNA-sequencing count data"
  image: /src/api/thumbnail.svg
  motivation: |
    Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present
    in each cell. As a result, the measurements (UMI counts) observed for each gene and each
    cell are associated with generally high levels of technical noise ([Grün et al.,
    2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of
    estimating the true expression level of each gene in each cell. In the single-cell
    literature, this task is also referred to as *imputation*, a term which is typically
    used for missing data problems in statistics. Similar to the use of the terms "dropout",
    "missing data", and "technical zeros", this terminology can create confusion about the
    underlying measurement process ([Sarkar and Stephens,
    2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).
  description: |
    A key challenge in evaluating denoising methods is the general lack of a ground truth. A
    recent benchmark study ([Hou et al.,
    2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))
    relied on flow-sorted datasets, mixture control experiments ([Tian et al.,
    2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk
    RNA-Seq data. Since each of these approaches suffers from specific limitations, it is
    difficult to combine these different approaches into a single quantitative measure of
    denoising accuracy. Here, we instead rely on an approach termed molecular
    cross-validation (MCV), which was specifically developed to quantify denoising accuracy
    in the absence of a ground truth ([Batson et al.,
    2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules
    in a given scRNA-Seq dataset are first partitioned between a *training* and a *test*
    dataset. Next, a denoising method is applied to the training dataset. Finally, denoising
    accuracy is measured by comparing the result to the test dataset. The authors show that
    both in theory and in practice, the measured denoising accuracy is representative of the
    accuracy that would be obtained on a ground truth dataset.
  test_resources:
    - type: s3
      path: s3://openproblems-data/resources_test/denoising/
      dest: resources_test/denoising
    - type: s3
      path: s3://openproblems-data/resources_test/common/
      dest: resources_test/common
authors:
  - name: "Wesley Lewis"
    roles: [ author, maintainer ]
    info:
      github: wes-lewis
  - name: "Scott Gigante"
    roles: [ author, maintainer ]
    info:
      github: scottgigante
      orcid: "0000-0002-4544-2764"
  - name: Robrecht Cannoodt
    roles: [ author ]
    info:
      github: rcannood
      orcid: "0000-0003-3641-729X"
  - name: Kai Waldrant
    roles: [ contributor ]
    info:
      github: KaiWaldrant
      orcid: "0009-0003-8555-1361"


version: dev
viash_version: 0.9.0-RC6

config_mods: |
  .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
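The molecular cross-validation (MCV) procedure described in the `info.description` field above can be sketched in a few lines. This is a minimal illustration, not the repository's actual evaluation code: it assumes NumPy, and the function names (`mcv_split`, `mcv_score`) and the identity "denoiser" are hypothetical. Each observed molecule is assigned to the training split with probability `train_frac` via binomial thinning, the denoiser runs on the training split only, and accuracy is the error against the held-out test split (after rescaling for the depth difference).

```python
import numpy as np

def mcv_split(counts, train_frac=0.5, seed=0):
    # Binomially thin each UMI count: every observed molecule lands in the
    # training split with probability train_frac, otherwise in the test split.
    rng = np.random.default_rng(seed)
    train = rng.binomial(counts, train_frac)
    test = counts - train
    return train, test

def mcv_score(counts, denoise_fn, train_frac=0.5, seed=0):
    # Denoise the training split only, rescale its expected depth to match
    # the test split, and score mean squared error against held-out counts.
    train, test = mcv_split(counts, train_frac, seed)
    denoised = denoise_fn(train) * (1 - train_frac) / train_frac
    return float(np.mean((denoised - test) ** 2))

# Toy cells-by-genes count matrix and an identity "denoiser" as a baseline.
counts = np.random.default_rng(1).poisson(2.0, size=(100, 50))
train, test = mcv_split(counts)
score = mcv_score(counts, lambda x: x)
```

A real denoising method would replace the identity lambda; because train and test splits contain disjoint molecules, a method that merely memorizes the training counts gains no advantage, which is what makes MCV usable without ground truth.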
2 changes: 1 addition & 1 deletion common
53 changes: 0 additions & 53 deletions src/api/task_info.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion src/workflows/run_benchmark/config.vsh.yaml
@@ -50,7 +50,7 @@ resources:
    path: main.nf
    entrypoint: run_wf
  - type: file
-   path: "../../api/task_info.yaml"
+   path: /_viash.yaml
dependencies:
  - name: common/check_dataset_schema
    repository: openproblems-v2
2 changes: 1 addition & 1 deletion src/workflows/run_benchmark/main.nf
@@ -153,7 +153,7 @@ workflow run_wf {
    def metric_configs_file = tempFile("metric_configs.yaml")
    metric_configs_file.write(metric_configs_yaml_blob)

-   def task_info_file = meta.resources_dir.resolve("task_info.yaml")
+   def task_info_file = meta.resources_dir.resolve("_viash.yaml")

    // store the scores in a file
    def score_uns = states.collect{it.score_uns}
