add methods

openproblems-bio · Jun 27, 2024 · c6093f6 · c6093f6
1 parent 5639642
commit c6093f6
Show file tree

Hide file tree

Showing 12 changed files with 474 additions and 102 deletions.
diff --git a/src/methods/alra/config.vsh.yaml b/src/methods/alra/config.vsh.yaml
@@ -0,0 +1,44 @@
+# Viash component config for the ALRA denoising method.
+# Shared method settings are pulled in from the API file referenced here.
+__merge__: ../../api/comp_method.yaml
+
+name: "alra"
+# Descriptive metadata: label, summary, long description and reference links.
+info:
+  label: ALRA
+  summary: "ALRA imputes missing values in scRNA-seq data by computing rank-k approximation, thresholding by gene, and rescaling the matrix."
+  description: |
+    Adaptively-thresholded Low Rank Approximation (ALRA). 
+    
+    ALRA is a method for imputation of missing values in single cell RNA-sequencing data, 
+    described in the preprint, "Zero-preserving imputation of scRNA-seq data using low-rank approximation" 
+    available [here](https://www.biorxiv.org/content/early/2018/08/22/397588). Given a 
+    scRNA-seq expression matrix, ALRA first computes its rank-k approximation using randomized SVD. 
+    Next, each row (gene) is thresholded by the magnitude of the most negative value of that gene. 
+    Finally, the matrix is rescaled.
+  reference: "linderman2018zero"
+  repository_url: "https://github.com/KlugerLab/ALRA"
+  documentation_url: https://github.com/KlugerLab/ALRA/blob/master/README.md
+  # NOTE(review): presumably the path/commit of the earlier (v1) implementation
+  # this component was ported from -- confirm.
+  v1:
+    path: openproblems/tasks/denoising/methods/alra.py
+    commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+  # Variant presets; `alra` defines no argument overrides.
+  variants: 
+    alra:
+  preferred_normalization: counts
+# Method-specific arguments; script.R reads this one as par$norm and
+# handles exactly the two choices listed here.
+arguments:
+  - name: "--norm"
+    type: string
+    choices: ["sqrt", "log"]
+    default: "log"
+    description: Normalization method
+# The component's executable is the R script next to this config.
+resources:
+  - type: r_script
+    path: script.R
+# Container build: base R image plus the CRAN/GitHub packages listed below.
+engines:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_r:1.0.4
+    setup:
+      - type: r
+        cran: [ Matrix, rsvd ]
+        github: KlugerLab/ALRA
+# Nextflow runner with the resource labels attached to this component.
+runners:
+  - type: nextflow
+    directives: 
+      label: [midtime, highmem, highcpu]
diff --git a/src/methods/alra/script.R b/src/methods/alra/script.R
@@ -0,0 +1,53 @@
+# ALRA denoising component.
+# Reads the training counts, library-size scales and transforms them, runs
+# ALRA, undoes the transform and scaling, and writes the result as the
+# `denoised` layer of a new AnnData file.
+cat(">> Loading dependencies\n")
+library(anndata, warn.conflicts = FALSE)
+library(ALRA, warn.conflicts = FALSE)
+
+## VIASH START
+par <- list(
+  input_train = "resources_test/denoising/pancreas/train.h5ad",
+  norm = "log",
+  output = "output.h5ad"
+)
+meta <- list(
+  functionality_name = "alra"
+)
+## VIASH END
+
+cat(">> Load input data\n")
+train <- read_h5ad(par$input_train, backed = "r")
+
+cat(">> Set normalization method\n")
+# Each supported normalization is paired with the function that reverses it,
+# so the ALRA output can be mapped back onto the scale of the input counts.
+norm_pairs <- list(
+  sqrt = list(forward = sqrt, inverse = function(x) x^2),
+  log = list(forward = log1p, inverse = expm1)
+)
+if (!(par$norm %in% names(norm_pairs))) {
+  stop("Unknown normalization method: ", par$norm)
+}
+norm_pair <- norm_pairs[[par$norm]]
+
+cat(">> Normalize data\n")
+expr <- as.matrix(train$layers[["counts"]])
+# Row sums are kept so the same per-row factor can be multiplied back in later.
+lib_size <- rowSums(expr)
+expr <- norm_pair$forward(sweep(expr, 1, lib_size, "/"))
+
+cat(">> Run ALRA\n")
+# Only the `A_norm_rank_k_cor_sc` element of the ALRA result is used.
+expr <- alra(expr)$A_norm_rank_k_cor_sc
+expr <- sweep(norm_pair$inverse(expr), 1, lib_size, "*")
+
+cat(">> Store output\n")
+result_uns <- list(
+  dataset_id = train$uns[["dataset_id"]],
+  method_id = meta$functionality_name
+)
+# obs/var are carried over without any columns (index only).
+output <- AnnData(
+  layers = list(denoised = expr),
+  obs = train$obs[, c(), drop = FALSE],
+  var = train$var[, c(), drop = FALSE],
+  uns = result_uns
+)
+
+cat(">> Write output to file\n")
+output$write_h5ad(par$output, compression = "gzip")
diff --git a/src/methods/dca/config.vsh.yaml b/src/methods/dca/config.vsh.yaml
@@ -0,0 +1,45 @@
+# Viash component config for the DCA (Deep Count Autoencoder) denoising method.
+# Shared method settings are pulled in from the API file referenced here.
+__merge__: ../../api/comp_method.yaml
+name: "dca"
+# Descriptive metadata: label, summary, long description and reference links.
+info:
+  label: DCA
+  summary: "A deep autoencoder with ZINB loss function to address the dropout effect in count data"
+  # Literal block scalar: the text must not be wrapped in quotes, otherwise the
+  # quote characters become part of the description itself.
+  description: |
+    Deep Count Autoencoder
+
+    Removes the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account
+    using a deep autoencoder with zero-inflated negative binomial (ZINB) loss function.
+  reference: "eraslan2019single"
+  documentation_url: "https://github.com/theislab/dca#readme"
+  repository_url: "https://github.com/theislab/dca"
+  # NOTE(review): presumably the path/commit of the earlier (v1) implementation
+  # this component was ported from -- confirm.
+  v1:
+    path: openproblems/tasks/denoising/methods/dca.py
+    commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+  # Variant presets; `dca` defines no argument overrides.
+  variants:
+    dca:
+  preferred_normalization: counts
+# Method-specific arguments; script.py reads this one as par["epochs"].
+arguments:
+  - name: "--epochs"
+    type: "integer"
+    default: 300
+    description: "Number of total epochs in training"
+# The component's executable is the Python script next to this config.
+resources:
+  - type: python_script
+    path: script.py
+# Container build: plain Python 3.9 image with the apt and pip packages below.
+# DCA itself is installed from a patched fork branch.
+engines:
+  - type: docker
+    image: python:3.9
+    setup:
+      - type: apt
+        packages: procps
+      - type: python
+        packages:
+          - anndata~=0.8.0
+          - scanpy
+          - pyyaml
+          - requests
+          - jsonschema
+          - "git+https://github.com/scottgigante-immunai/dca.git@patch-1"
+# Nextflow runner with the resource labels attached to this component.
+runners:
+  - type: nextflow
+    directives:
+      label: [midtime, highmem, highcpu]
diff --git a/src/methods/dca/script.py b/src/methods/dca/script.py
@@ -0,0 +1,39 @@
+import anndata as ad
+from dca.api import dca
+
+## VIASH START
+par = {
+    'input_train': 'resources_test/denoising/pancreas/train.h5ad',
+    'output': 'output_dca.h5ad',
+    'epochs': 300,
+}
+meta = {
+    'functionality_name': 'dca',
+}
+## VIASH END
+
+print("load input data", flush=True)
+input_train = ad.read_h5ad(par['input_train'], backed="r")
+
+print("Remove unneeded data", flush=True)
+output = ad.AnnData(
+    X=input_train.layers["counts"],
+    obs=input_train.obs[[]],
+    var=input_train.var[[]],
+    uns={
+        "dataset_id": input_train.uns["dataset_id"],
+        "method_id": meta["functionality_name"]
+    }
+)
+
+del input_train
+
+print("Run DCA", flush=True)
+dca(output, epochs=par["epochs"])
+
+print("Move output to correct location", flush=True)
+output.layers["denoised"] = output.X
+del output.X
+
+print("Writing data", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/methods/knn_smoothing/config.vsh.yaml b/src/methods/knn_smoothing/config.vsh.yaml
@@ -0,0 +1,43 @@
+# Viash component config for the iterative kNN-smoothing denoising method.
+# Shared method settings are pulled in from the API file referenced here.
+__merge__: ../../api/comp_method.yaml
+
+name: "knn_smoothing"
+# Descriptive metadata: label, summary, long description and reference links.
+info:
+  label: KNN Smoothing
+  summary: "Iterative kNN-smoothing denoises scRNA-seq data by iteratively increasing the size of neighbourhoods for smoothing until a maximum k value is reached."
+  description: "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq
+      expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first
+      applies initial normalisation and smoothing. Then, a chosen number of
+      principal components is used to calculate Euclidean distances between cells.
+      Minimally sized neighbourhoods are initially determined from these Euclidean
+      distances, and expression profiles are shared between neighbouring cells.
+      Then, the resultant smoothed matrix is used as input to the next step of
+      smoothing, where the size (k) of the considered neighbourhoods is increased,
+      leading to greater smoothing. This process continues until a chosen maximum k
+      value has been reached, at which point the iteratively smoothed object is
+      then optionally scaled to yield a final result."
+  reference: "wagner2018knearest"
+  documentation_url: "https://github.com/yanailab/knn-smoothing#readme"
+  repository_url: "https://github.com/yanailab/knn-smoothing"
+  # NOTE(review): presumably the path/commit of the earlier (v1) implementation
+  # this component was ported from -- confirm.
+  v1:
+    path: openproblems/tasks/denoising/methods/knn_smoothing.py
+    commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+  # Variant presets; `knn_smoothing` defines no argument overrides.
+  variants: 
+    knn_smoothing:
+  preferred_normalization: counts
+# No method-specific arguments are declared in this file.
+# The component's executable is the Python script next to this config.
+resources:
+  - type: python_script
+    path: script.py
+
+# Container build: base Python image plus scipy and the packaged
+# knn-smoothing fork installed from GitHub.
+engines:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        packages:
+          - scipy
+        github:
+          - scottgigante-immunai/knn-smoothing@python_package
+# Nextflow runner with the resource labels attached to this component.
+runners:
+  - type: nextflow
+    directives: 
+      label: [midtime, highmem, highcpu]
diff --git a/src/methods/knn_smoothing/script.py b/src/methods/knn_smoothing/script.py
@@ -0,0 +1,39 @@
+import knn_smooth
+import anndata as ad
+
+## VIASH START
+par = {
+    'input_train': 'resources_test/denoising/pancreas/train.h5ad',
+    'output': 'output_knn.h5ad',
+}
+meta = {
+    'functionality_name': 'foo',
+}
+## VIASH END
+
+print("Load input data", flush=True)
+input_train = ad.read_h5ad(par["input_train"], backed="r")
+
+print("Remove unneeded data", flush=True)
+X = input_train.layers["counts"].astype(float).transpose().toarray()
+
+# Create output AnnData for later use
+output = ad.AnnData(
+    obs=input_train.obs[[]],
+    var=input_train.var[[]],
+    uns={
+        "dataset_id": input_train.uns["dataset_id"],
+        "method_id": meta["functionality_name"]
+    }
+)
+
+del input_train
+
+print("Run KNN smoothing", flush=True)
+X = knn_smooth.knn_smoothing(X, k=10).transpose()
+
+print("Process data", flush=True)
+output.layers["denoised"] = X
+
+print("Writing data", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/methods/magic/config.vsh.yaml b/src/methods/magic/config.vsh.yaml
@@ -0,0 +1,63 @@
+# Viash component config for the MAGIC denoising method.
+# Shared method settings are pulled in from the API file referenced here.
+__merge__: ../../api/comp_method.yaml
+name: "magic"
+# Descriptive metadata: label, summary, long description and reference links.
+info:
+  label: MAGIC
+  summary: "MAGIC imputes and denoises scRNA-seq data that is noisy or dropout-prone."
+  description: "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for
+      imputation and denoising of noisy or dropout-prone single cell RNA-sequencing
+      data. Given a normalised scRNA-seq expression matrix, it first calculates
+      Euclidean distances between each pair of cells in the dataset, which is then
+      augmented using a Gaussian kernel (function) and row-normalised to give a
+      normalised affinity matrix. A t-step markov process is then calculated, by
+      powering this affinity matrix t times. Finally, the powered affinity matrix
+      is right-multiplied by the normalised data, causing the final imputed values
+      to take the value of a per-gene average weighted by the affinities of cells.
+      The resultant imputed matrix is then rescaled, to more closely match the
+      magnitude of measurements in the normalised (input) matrix."
+  reference: "van2018recovering"
+  documentation_url: "https://github.com/KrishnaswamyLab/MAGIC#readme"
+  repository_url: "https://github.com/KrishnaswamyLab/MAGIC"
+  # NOTE(review): presumably the path/commit of the earlier (v1) implementation
+  # this component was ported from -- confirm.
+  v1:
+    path: openproblems/tasks/denoising/methods/magic.py
+    commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+  # Variant presets: each entry overrides a subset of the arguments declared
+  # below; `magic` defines no overrides.
+  variants: 
+    magic:
+    magic_approx:
+      solver: approximate
+    magic_knn_naive:
+      norm: log
+      # NOTE(review): `--decay` is declared below as `type: integer`, but this
+      # variant passes the string `none` -- confirm how this value is parsed
+      # and handled by the script.
+      decay: none
+      t: 1
+  preferred_normalization: counts
+# Method-specific arguments and their defaults.
+arguments:
+  - name: "--solver"
+    type: "string"
+    choices: ["exact", "approximate"]
+    default: "exact"
+    description: Which solver to use.
+  - name: "--norm"
+    type: string
+    choices: ["sqrt", "log"]
+    default: "log"
+    description: Normalization method
+  - name: "--decay"
+    type: integer
+    default: 1
+    description: sets decay rate of kernel tails
+  - name: "--t"
+    type: integer
+    default: 3
+    description: power to which the diffusion operator is powered
+# The component's executable is the Python script next to this config.
+resources:
+  - type: python_script
+    path: script.py
+# Container build: base Python image plus the pip packages listed below
+# (scikit-learn is capped below 1.2).
+engines:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        pip: [scprep, magic-impute, scipy, scikit-learn<1.2]
+# Nextflow runner with the resource labels attached to this component.
+runners:
+  - type: nextflow
+    directives: 
+      label: [midtime, highmem, highcpu]