Skip to content

Commit

Permalink
norm and batch corrections are separated
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Jul 30, 2024
1 parent 5bc6fc0 commit 078f2ca
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 63 deletions.
18 changes: 15 additions & 3 deletions scripts/generate_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,19 @@ viash run src/process_data/test_data_counts/config.novsh.yaml
echo ">> process multiome"
viash run src/process_data/multiomics/multiome/config.vsh.yaml

echo ">> process perturbation data"
viash run src/process_data/perturbation/sc_counts/config.vsh.yaml
echo ">> Preprocess perturbation data"
echo "> QC, pseudobulk, and filter"
# viash run src/process_data/perturbation/sc_counts/config.vsh.yaml
python src/process_data/perturbation/sc_counts/script.py
echo "> Normalize counts"
# viash run src/process_data/perturbation/normalization/config.vsh.yaml
python src/process_data/perturbation/normalization/script.py
echo "> Batch correction using scgen"
# viash run src/process_data/perturbation/batch_correction_scgen/config.vsh.yaml
python src/process_data/perturbation/batch_correction_scgen/script.py
echo "> Batch correction using seurat"
viash run src/process_data/perturbation/batch_correction_seurat/config.vsh.yaml



# echo ">> Perturbation data: batch correction" TODO"
Expand All @@ -24,6 +35,7 @@ viash run src/process_data/perturbation/sc_counts/config.vsh.yaml

# echo ">> process supp: TODO"

echo ">> Format multiomics data for R "
echo ">> Extract matrix and annotations from multiome "
viash run src/process_data/multiomics/multiome_matrix/config.vsh.yaml

Expand All @@ -32,7 +44,7 @@ echo ">> Construct rds files from multiomics count matrix and annotations"
viash run src/process_data/multiomics/multiome_r/config.vsh.yaml


echo ">> create test resources "
echo ">> Create test resources"
echo ">> Extract matrix and annotations from multiome: test data "
viash run src/process_data/multiomics/multiome_matrix/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad \
--multiomics_atac resources_test/grn-benchmark/multiomics_atac.h5ad \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
functionality:
name: batch_correction_scgen
info:
label: batch_correction_scgen
summary: "Correct batch effects using scgen"

arguments:
- name: --perturbation_data_n
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
- name: pearson
type: double
description: "Normalized values using pearson residuals"
required: true
- name: lognorm
type: double
description: "Normalized values using shifted logarithm"
required: true
required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad
- name: --perturbation_data_bc
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
- name: pearson
type: double
description: "Normalized values using pearson residuals"
required: true
- name: lognorm
type: double
description: "Normalized values using shifted logarithm"
required: true
- name: scgen_lognorm
type: double
description: "Batch correction using scgen on lognorm data"
required: true
- name: scgen_pearson
type: double
description: "Batch correction using scgen on pearson data"
required: true
required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad

resources:
- type: python_script
path: script.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
setup:
- type: python
packages: [scgen]

- type: native
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import scgen

par = {
'perturbation_data': 'resources/raw-data/perturbation_data.h5ad',
'perturbation_data_n': 'resources/raw-data/perturbation_data.h5ad',
"perturbation_data_bc": 'resources/raw-data/perturbation_data.h5ad'
}


bulk_adata = ad.read_h5ad(par['perturbation_data'])
bulk_adata = ad.read_h5ad(par['perturbation_data_n'])
print(bulk_adata)

for norm_name in ['lognorm', 'pearson']:
Expand All @@ -30,4 +31,4 @@
corrected_adata = model.batch_removal()

bulk_adata.layers[f'scgen_{norm_name}'] = corrected_adata.X
bulk_adata.write_h5ad(par['perturbation_data'])
bulk_adata.write_h5ad(par['perturbation_data_bc'])

This file was deleted.

43 changes: 40 additions & 3 deletions src/process_data/perturbation/normalization/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,46 @@ functionality:

arguments:

- name: --perturbation_data
type: file
required: true
- name: --perturbation_data_f
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true

required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad

- name: --perturbation_data_n
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
- name: pearson
type: double
description: "Normalized values using pearson residuals"
required: true
- name: lognorm
type: double
description: "Normalized values using shifted logarithm"
required: true
required: false
direction: input
default: resources/grn-benchmark/perturbation_data.h5ad


resources:
Expand Down
11 changes: 7 additions & 4 deletions src/process_data/perturbation/normalization/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import scanpy as sc

par = {
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
'perturbation_data_f': 'resources/grn-benchmark/perturbation_data.h5ad',
'perturbation_data_n': 'resources/grn-benchmark/perturbation_data.h5ad'
}

def normalize_func(bulk_adata):
Expand All @@ -27,7 +28,9 @@ def normalize_func(bulk_adata):
bulk_adata.layers['lognorm'] = bulk_adata_c.X

return bulk_adata

print("reading the file")
bulk_adata_filtered = ad.read_h5ad(par['perturbation_data_f'])
bulk_adata_n = normalize_func(bulk_adata_filtered)
bulk_adata_n.write(par['perturbation_data'])
print("Normalizing completed")
print("Writing the file")
bulk_adata_n.write(par['perturbation_data_n'])
28 changes: 18 additions & 10 deletions src/process_data/perturbation/sc_counts/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,32 @@ functionality:
label: perturbation_counts
summary: "Processes sc counts of perturbation data to generate pseudobulk data"
# description: |
# "
#    It conducts QC on the sc level to remove low-quality cells and genes.
#    Then, sc counts are pseudobulked and filtered for outlier compounds, samples with low-quality cells, and genes with low coverage.
#    Finally, it normalizes the counts data.
# "

# It conducts QC on the sc level to remove low-quality cells and genes.
# Then, sc counts are pseudobulked and filtered for outlier compounds, samples with low-quality cells, and genes with low coverage.
# Finally, it normalizes the counts data.

arguments:
- name: --perturbation_counts
type: file
required: false
direction: input
default: resources/datasets_raw/perturbation_counts.h5ad
default: resources_test/datasets_raw/perturbation_counts.h5ad

- name: --perturbation_data
type: file
- name: --perturbation_data_f
type: file
info:
label: perturbation
summary: "Perturbation dataset for benchmarking."
file_type: h5ad
slots:
layers:
- name: n_counts
type: double
description: "Pseudobulked values using mean approach"
required: true
required: false
direction: output
default: resources/grn-benchmark/perturbation_data.h5ad
default: resources_test/grn-benchmark/perturbation_data.h5ad


resources:
Expand Down
21 changes: 3 additions & 18 deletions src/process_data/perturbation/sc_counts/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

par = {
'sc_counts': 'resources/datasets_raw/perturbation_counts.h5ad',
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
'perturbation_data_f': 'resources/grn-benchmark/perturbation_data.h5ad',
}

def preprocess_sc(par):
Expand Down Expand Up @@ -186,25 +186,10 @@ def filter_func(bulk_adata):
for key in ['cell_type','plate_name']:
bulk_adata_filtered.obs[key] = bulk_adata_filtered.obs[key].astype(str)
return bulk_adata_filtered
def normalize_func(bulk_adata):
    """Add 'pearson' and 'lognorm' normalization layers to a pseudobulk AnnData.

    Parameters
    ----------
    bulk_adata :
        Pseudobulk expression object carrying a ``'counts'`` layer.
        # assumes an AnnData-like object (has .X, .layers, .copy()) — TODO confirm

    Returns
    -------
    The same object, mutated in place: ``.X`` is reset to the ``'counts'``
    layer and two new layers (``'pearson'``, ``'lognorm'``) are added.
    """
    # normalize data based on mean pseudobulk
    # Reset .X to raw counts so both normalizations start from the same data.
    bulk_adata.X = bulk_adata.layers['counts'].copy()

    # Pearson-residual normalization (scanpy experimental API), computed on a
    # copy so that .X of the original keeps the raw counts.
    bulk_adata_c = bulk_adata.copy()
    sc.experimental.pp.normalize_pearson_residuals(bulk_adata_c)
    bulk_adata.layers['pearson'] = bulk_adata_c.X

    # Shifted-log normalization: total-count normalize, log1p, then z-scale.
    # NOTE(review): because of sc.pp.scale, 'lognorm' holds standardized
    # (zero-mean, unit-variance) log-normalized values, not plain log counts.
    bulk_adata_c = bulk_adata.copy()
    sc.pp.normalize_total(bulk_adata_c)
    sc.pp.log1p(bulk_adata_c)
    sc.pp.scale(bulk_adata_c)
    bulk_adata.layers['lognorm'] = bulk_adata_c.X

    return bulk_adata


sc_counts_f = preprocess_sc(par)
bulk_adata = pseudobulk_sum_func(sc_counts_f)
bulk_adata = pseudobulk_mean_func(bulk_adata)
bulk_adata_filtered = filter_func(bulk_adata)
bulk_adata_n = normalize_func(bulk_adata_filtered)
bulk_adata_n.write(par['perturbation_data'])
bulk_adata_filtered.write(par['perturbation_data_f'])

0 comments on commit 078f2ca

Please sign in to comment.