Skip to content

Commit

Permalink
norm and batch corrections are separated
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Jul 30, 2024
1 parent 5bc6fc0 commit 078f2ca
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 63 deletions.
18 changes: 15 additions & 3 deletions scripts/generate_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,19 @@ viash run src/process_data/test_data_counts/config.novsh.yaml
echo ">> process multiome"
viash run src/process_data/multiomics/multiome/config.vsh.yaml

echo ">> process perturbation data"
viash run src/process_data/perturbation/sc_counts/config.vsh.yaml
echo ">> Preprocess perturbation data"
echo "> QC, pseudobulk, and filter"
# viash run src/process_data/perturbation/sc_counts/config.vsh.yaml
python src/process_data/perturbation/sc_counts/script.py
echo "> Normalize counts"
# viash run src/process_data/perturbation/normalization/config.vsh.yaml
python src/process_data/perturbation/normalization/script.py
echo "> Batch correction using scgen"
# viash run src/process_data/perturbation/batch_correction_scgen/config.vsh.yaml
python src/process_data/perturbation/batch_correction_scgen/script.py
echo "> Batch correction using seurat"
viash run src/process_data/perturbation/batch_correction_seurat/config.vsh.yaml



# echo ">> Perturbation data: batch correction" TODO"
Expand All @@ -24,6 +35,7 @@ viash run src/process_data/perturbation/sc_counts/config.vsh.yaml

# echo ">> process supp: TODO"

echo ">> Format multiomics data for R "
echo ">> Extract matrix and annotations from multiome "
viash run src/process_data/multiomics/multiome_matrix/config.vsh.yaml

Expand All @@ -32,7 +44,7 @@ echo ">> Construct rds files from multiomics count matrix and annotations"
viash run src/process_data/multiomics/multiome_r/config.vsh.yaml


echo ">> create test resources "
echo ">> Create test resources"
echo ">> Extract matrix and annotations from multiome: test data "
viash run src/process_data/multiomics/multiome_matrix/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad \
--multiomics_atac resources_test/grn-benchmark/multiomics_atac.h5ad \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
functionality:
name: batch_correction_scgen
info:
label: batch_correction_scgen
summary: "Correct batch effects using scgen"

arguments:
- name: --perturbation_data_n
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
- name: pearson
type: double
description: "Normalized values using pearson residuals"
required: true
- name: lognorm
type: double
description: "Normalized values using shifted logarithm"
required: true
required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad
- name: --perturbation_data_bc
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
- name: pearson
type: double
description: "Normalized values using pearson residuals"
required: true
- name: lognorm
type: double
description: "Normalized values using shifted logarithm"
required: true
- name: scgen_lognorm
type: double
description: "Batch correction using scgen on lognorm data"
required: true
- name: scgen_pearson
type: double
description: "Batch correction using scgen on pearson data"
required: true
required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad

resources:
- type: python_script
path: script.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
setup:
- type: python
packages: [scgen]

- type: native
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import scgen

par = {
'perturbation_data': 'resources/raw-data/perturbation_data.h5ad',
'perturbation_data_n': 'resources/raw-data/perturbation_data.h5ad',
"perturbation_data_bc": 'resources/raw-data/perturbation_data.h5ad'
}


bulk_adata = ad.read_h5ad(par['perturbation_data'])
bulk_adata = ad.read_h5ad(par['perturbation_data_n'])
print(bulk_adata)

for norm_name in ['lognorm', 'pearson']:
Expand All @@ -30,4 +31,4 @@
corrected_adata = model.batch_removal()

bulk_adata.layers[f'scgen_{norm_name}'] = corrected_adata.X
bulk_adata.write_h5ad(par['perturbation_data'])
bulk_adata.write_h5ad(par['perturbation_data_bc'])

This file was deleted.

43 changes: 40 additions & 3 deletions src/process_data/perturbation/normalization/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,46 @@ functionality:

arguments:

- name: --perturbation_data
type: file
required: true
- name: --perturbation_data_f
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true

required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad

- name: --perturbation_data_n
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
- name: pearson
type: double
description: "Normalized values using pearson residuals"
required: true
- name: lognorm
type: double
description: "Normalized values using shifted logarithm"
required: true
required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad


resources:
Expand Down
11 changes: 7 additions & 4 deletions src/process_data/perturbation/normalization/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import scanpy as sc

par = {
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
'perturbation_data_f': 'resources/grn-benchmark/perturbation_data.h5ad',
'perturbation_data_n': 'resources/grn-benchmark/perturbation_data.h5ad'
}

def normalize_func(bulk_adata):
Expand All @@ -27,7 +28,9 @@ def normalize_func(bulk_adata):
bulk_adata.layers['lognorm'] = bulk_adata_c.X

return bulk_adata

print("reading the file")
bulk_adata_filtered = ad.read_h5ad(par['perturbation_data_f'])
bulk_adata_n = normalize_func(bulk_adata_filtered)
bulk_adata_n.write(par['perturbation_data'])
print("Normalizing completed")
print("Writing the file")
bulk_adata_n.write(par['perturbation_data_n'])
28 changes: 18 additions & 10 deletions src/process_data/perturbation/sc_counts/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,32 @@ functionality:
label: perturbation_counts
summary: "Processes sc counts of perturbation data to generate pseudobulk data"
# description: |
# "
#    It conducts QC on the sc level to remove low-quality cells and genes.
#    Then, sc counts are pseudobulked and filtered for outlier compounds, samples with low-quality cells, and genes with low coverage.
#    Finally, it normalizes the counts data.
# "

# It conducts QC on the sc level to remove low-quality cells and genes.
# Then, sc counts are pseudobulked and filtered for outlier compounds, samples with low-quality cells, and genes with low coverage.
# Finally, it normalizes the counts data.

arguments:
- name: --perturbation_counts
type: file
required: false
direction: input
default: resources/datasets_raw/perturbation_counts.h5ad
default: resources_test/datasets_raw/perturbation_counts.h5ad

- name: --perturbation_data
type: file
- name: --perturbation_data_f
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
required: false
direction: output
default: resources/grn-benchmark/perturbation_data.h5ad
default: resources_test/grn-benchmark/perturbation_data.h5ad


resources:
Expand Down
21 changes: 3 additions & 18 deletions src/process_data/perturbation/sc_counts/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

par = {
'sc_counts': 'resources/datasets_raw/perturbation_counts.h5ad',
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
'perturbation_data_f': 'resources/grn-benchmark/perturbation_data.h5ad',
}

def preprocess_sc(par):
Expand Down Expand Up @@ -186,25 +186,10 @@ def filter_func(bulk_adata):
for key in ['cell_type','plate_name']:
bulk_adata_filtered.obs[key] = bulk_adata_filtered.obs[key].astype(str)
return bulk_adata_filtered
def normalize_func(bulk_adata):
    """Add 'pearson' and 'lognorm' normalization layers to a pseudobulk AnnData.

    Parameters
    ----------
    bulk_adata :
        Pseudobulk expression object carrying a ``'counts'`` layer.
        # assumes an AnnData-like object (has .X, .layers, .copy()) — TODO confirm

    Returns
    -------
    The same object, mutated in place: ``.X`` is reset to the ``'counts'``
    layer and two new layers (``'pearson'``, ``'lognorm'``) are added.
    """
    # normalize data based on mean pseudobulk
    # Reset .X to raw counts so both normalizations start from the same data.
    bulk_adata.X = bulk_adata.layers['counts'].copy()

    # Pearson-residual normalization (scanpy experimental API), computed on a
    # copy so that .X of the original keeps the raw counts.
    bulk_adata_c = bulk_adata.copy()
    sc.experimental.pp.normalize_pearson_residuals(bulk_adata_c)
    bulk_adata.layers['pearson'] = bulk_adata_c.X

    # Shifted-log normalization: total-count normalize, log1p, then z-scale.
    # NOTE(review): because of sc.pp.scale, 'lognorm' holds standardized
    # (zero-mean, unit-variance) log-normalized values, not plain log counts.
    bulk_adata_c = bulk_adata.copy()
    sc.pp.normalize_total(bulk_adata_c)
    sc.pp.log1p(bulk_adata_c)
    sc.pp.scale(bulk_adata_c)
    bulk_adata.layers['lognorm'] = bulk_adata_c.X

    return bulk_adata


sc_counts_f = preprocess_sc(par)
bulk_adata = pseudobulk_sum_func(sc_counts_f)
bulk_adata = pseudobulk_mean_func(bulk_adata)
bulk_adata_filtered = filter_func(bulk_adata)
bulk_adata_n = normalize_func(bulk_adata_filtered)
bulk_adata_n.write(par['perturbation_data'])
bulk_adata_filtered.write(par['perturbation_data_f'])

0 comments on commit 078f2ca

Please sign in to comment.