diff --git a/_viash.yaml b/_viash.yaml
index 6380704..4b9a7d6 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -1,9 +1,9 @@
-viash_version: 0.9.0-RC6
-
 name: task_denoising
+version: dev
+
 organization: openproblems-bio
 description: |
-  An OpenProblems benchmark task.
+  Removing noise in sparse single-cell RNA-sequencing count data.
 license: MIT
 keywords: [single-cell, openproblems, benchmark, denoising]
 links:
@@ -12,6 +12,37 @@ links:
   docker_registry: ghcr.io
 info:
+  label: Denoising
+  summary: "Removing noise in sparse single-cell RNA-sequencing count data"
+  image: /src/api/thumbnail.svg
+  motivation: |
+    Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present
+    in each cell. As a result, the measurements (UMI counts) observed for each gene and each
+    cell are associated with generally high levels of technical noise ([Grün et al.,
+    2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of
+    estimating the true expression level of each gene in each cell. In the single-cell
+    literature, this task is also referred to as *imputation*, a term which is typically
+    used for missing data problems in statistics. Similar to the use of the terms "dropout",
+    "missing data", and "technical zeros", this terminology can create confusion about the
+    underlying measurement process ([Sarkar and Stephens,
+    2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).
+  description: |
+    A key challenge in evaluating denoising methods is the general lack of a ground truth. A
+    recent benchmark study ([Hou et al.,
+    2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))
+    relied on flow-sorted datasets, mixture control experiments ([Tian et al.,
+    2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk
+    RNA-Seq data. Since each of these approaches suffers from specific limitations, it is
+    difficult to combine these different approaches into a single quantitative measure of
+    denoising accuracy. Here, we instead rely on an approach termed molecular
+    cross-validation (MCV), which was specifically developed to quantify denoising accuracy
+    in the absence of a ground truth ([Batson et al.,
+    2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules
+    in a given scRNA-Seq dataset are first partitioned between a *training* and a *test*
+    dataset. Next, a denoising method is applied to the training dataset. Finally, denoising
+    accuracy is measured by comparing the result to the test dataset. The authors show that
+    both in theory and in practice, the measured denoising accuracy is representative of the
+    accuracy that would be obtained on a ground truth dataset.
   test_resources:
     - type: s3
       path: s3://openproblems-data/resources_test/denoising/
@@ -19,9 +50,28 @@ info:
     - type: s3
       path: s3://openproblems-data/resources_test/common/
       dest: resources_test/common
+authors:
+  - name: "Wesley Lewis"
+    roles: [ author, maintainer ]
+    info:
+      github: wes-lewis
+  - name: "Scott Gigante"
+    roles: [ author, maintainer ]
+    info:
+      github: scottgigante
+      orcid: "0000-0002-4544-2764"
+  - name: Robrecht Cannoodt
+    roles: [ author ]
+    info:
+      github: rcannood
+      orcid: "0000-0003-3641-729X"
+  - name: Kai Waldrant
+    roles: [ contributor ]
+    info:
+      github: KaiWaldrant
+      orcid: "0009-0003-8555-1361"
-
-version: dev
+viash_version: 0.9.0-RC6
 config_mods: |
   .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
diff --git a/common b/common
index d4fd76b..fab42b1 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit d4fd76b71c73d6d54d08604a0cc3a8cff94f70d8
+Subproject commit fab42b17d1d88317e42bef4849f6ef16687e3531
diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml
deleted file mode 100644
index 407f953..0000000
--- a/src/api/task_info.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-name: denoising
-label: Denoising
-summary: "Removing noise in sparse single-cell RNA-sequencing count data"
-image: "thumbnail.svg"
-motivation: |
-  Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present
-  in each cell. As a result, the measurements (UMI counts) observed for each gene and each
-  cell are associated with generally high levels of technical noise ([Grün et al.,
-  2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of
-  estimating the true expression level of each gene in each cell. In the single-cell
-  literature, this task is also referred to as *imputation*, a term which is typically
-  used for missing data problems in statistics. Similar to the use of the terms "dropout",
-  "missing data", and "technical zeros", this terminology can create confusion about the
-  underlying measurement process ([Sarkar and Stephens,
-  2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)).
-description: |
-  A key challenge in evaluating denoising methods is the general lack of a ground truth. A
-  recent benchmark study ([Hou et al.,
-  2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x))
-  relied on flow-sorted datasets, mixture control experiments ([Tian et al.,
-  2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk
-  RNA-Seq data. Since each of these approaches suffers from specific limitations, it is
-  difficult to combine these different approaches into a single quantitative measure of
-  denoising accuracy. Here, we instead rely on an approach termed molecular
-  cross-validation (MCV), which was specifically developed to quantify denoising accuracy
-  in the absence of a ground truth ([Batson et al.,
-  2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules
-  in a given scRNA-Seq dataset are first partitioned between a *training* and a *test*
-  dataset. Next, a denoising method is applied to the training dataset. Finally, denoising
-  accuracy is measured by comparing the result to the test dataset. The authors show that
-  both in theory and in practice, the measured denoising accuracy is representative of the
-  accuracy that would be obtained on a ground truth dataset.
-
-authors:
-  - name: "Wesley Lewis"
-    roles: [ author, maintainer ]
-    info:
-      github: wes-lewis
-  - name: "Scott Gigante"
-    roles: [ author, maintainer ]
-    info:
-      github: scottgigante
-      orcid: "0000-0002-4544-2764"
-  - name: Robrecht Cannoodt
-    roles: [ author ]
-    info:
-      github: rcannood
-      orcid: "0000-0003-3641-729X"
-  - name: Kai Waldrant
-    roles: [ contributor ]
-    info:
-      github: KaiWaldrant
-      orcid: "0009-0003-8555-1361"
\ No newline at end of file
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 02ee2ae..3d1b6bc 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -50,7 +50,7 @@ resources:
     path: main.nf
     entrypoint: run_wf
   - type: file
-    path: "../../api/task_info.yaml"
+    path: /_viash.yaml
 dependencies:
   - name: common/check_dataset_schema
     repository: openproblems-v2
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 600fc12..14d2494 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -153,7 +153,7 @@ workflow run_wf {
     def metric_configs_file = tempFile("metric_configs.yaml")
     metric_configs_file.write(metric_configs_yaml_blob)
 
-    def task_info_file = meta.resources_dir.resolve("task_info.yaml")
+    def task_info_file = meta.resources_dir.resolve("_viash.yaml")
 
     // store the scores in a file
     def score_uns = states.collect{it.score_uns}
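
Note on the info block moved into _viash.yaml above: the molecular cross-validation (MCV)
procedure it describes can be summarised in a few lines of code. The sketch below is a
minimal illustration only, not this repository's implementation; the `counts` matrix, the
`denoise` callable, the 90/10 split fraction, and the MSE loss are all assumed for the
example.

    # Minimal sketch of molecular cross-validation (MCV), assuming a raw UMI
    # count matrix `counts` (cells x genes) and a hypothetical `denoise`
    # function -- neither is part of this repository.
    import numpy as np

    def mcv_split(counts, train_frac=0.9, seed=0):
        """Assign each observed molecule to a train or a test count matrix."""
        rng = np.random.default_rng(seed)
        train = rng.binomial(counts.astype(int), train_frac)
        return train, counts - train

    def mcv_score(counts, denoise, train_frac=0.9):
        """Denoise the train split and score it against the held-out split."""
        train, test = mcv_split(counts, train_frac)
        denoised = denoise(train).astype(float)
        # Rescale to the expected depth of the test split before comparing,
        # since the test split holds (1 - train_frac) of the molecules.
        denoised *= (1 - train_frac) / train_frac
        return float(np.mean((denoised - test) ** 2))

    # Example: score the trivial "no denoising" baseline.
    # counts = np.random.poisson(1.0, size=(100, 2000))
    # print(mcv_score(counts, lambda x: x))

Because the train and test splits are independent samples of the same underlying
expression levels, a lower score against the held-out split tracks the error a method
would achieve against an (unobservable) ground truth, which is the property Batson et al.
establish.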