From f95029d8d6a4949ae23b2e2c99249c7807f3c41f Mon Sep 17 00:00:00 2001 From: jalil Date: Wed, 18 Sep 2024 10:08:12 +0200 Subject: [PATCH] benchmark all added --- scripts/run_benchmark_single_omics.sh | 41 +++++----- src/api/comp_method.yaml | 3 +- .../baseline_corr/config.vsh.yaml | 1 - src/control_methods/baseline_corr/script.py | 7 +- src/control_methods/pearson/config.vsh.yaml | 24 ++++++ src/control_methods/pearson/script.py | 76 ++++++++++++++++++ src/control_methods/pearson/test.sh | 5 ++ .../pearson_causal/config.vsh.yaml | 24 ++++++ src/control_methods/pearson_causal/script.py | 77 +++++++++++++++++++ src/control_methods/pearson_causal/test.sh | 5 ++ .../positive_control/config.vsh.yaml | 29 +++++++ .../positive_control/script.py | 74 ++++++++++++++++++ src/control_methods/positive_control/test.sh | 5 ++ .../{config.novsh.yaml => config.vsh.yaml} | 0 src/methods/single_omics/grnboost2/script.py | 4 +- src/methods/single_omics/portia/script.py | 2 +- src/metrics/regression_1/script.py | 6 +- .../config.vsh.yaml | 21 +++-- .../main.nf | 17 +++- 19 files changed, 384 insertions(+), 37 deletions(-) create mode 100644 src/control_methods/pearson/config.vsh.yaml create mode 100644 src/control_methods/pearson/script.py create mode 100644 src/control_methods/pearson/test.sh create mode 100644 src/control_methods/pearson_causal/config.vsh.yaml create mode 100644 src/control_methods/pearson_causal/script.py create mode 100644 src/control_methods/pearson_causal/test.sh create mode 100644 src/control_methods/positive_control/config.vsh.yaml create mode 100644 src/control_methods/positive_control/script.py create mode 100644 src/control_methods/positive_control/test.sh rename src/methods/multi_omics/celloracle/{config.novsh.yaml => config.vsh.yaml} (100%) rename src/workflows/{run_benchmark_single_omics => run_benchmark_all}/config.vsh.yaml (84%) rename src/workflows/{run_benchmark_single_omics => run_benchmark_all}/main.nf (94%) diff --git a/scripts/run_benchmark_single_omics.sh b/scripts/run_benchmark_single_omics.sh index 0547c906d..8d731a0cb 100644 --- a/scripts/run_benchmark_single_omics.sh +++ b/scripts/run_benchmark_single_omics.sh @@ -2,8 +2,8 @@ # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" RUN_ID="single_omics_inference" -# resources_dir="./resources_test/" -resources_dir="s3://openproblems-data/resources/grn" +resources_dir="./resources_test/" +# resources_dir="s3://openproblems-data/resources/grn" publish_dir="${resources_dir}/results/${RUN_ID}" @@ -11,9 +11,10 @@ reg_type=ridge subsample=-2 max_workers=10 layer='scgen_pearson' -metric_ids="[regression_1, regression_2]" +metric_ids="[regression_1]" +cell_type_specific=true #for controls # method_ids="[tigress, ennet, scsgl, pidc]" -method_ids="[scenic]" +method_ids="[pearson_corr, pearson_causal, positive_control]" param_file="./params/${RUN_ID}.yaml" @@ -24,24 +25,26 @@ param_list: metric_ids: $metric_ids method_ids: $method_ids perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad - multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_0.h5ad + multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad + multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac.h5ad reg_type: $reg_type subsample: $subsample max_workers: $max_workers layer: $layer consensus: ${resources_dir}/prior/consensus-num-regulators.json tf_all: ${resources_dir}/prior/tf_all.csv + cell_type_specific: ${cell_type_specific} output_state: "state.yaml" publish_dir: "$publish_dir" HERE -# nextflow run . \ -# -main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \ -# -profile docker \ -# -with-trace \ -# -c src/common/nextflow_helpers/labels_ci.config \ -# -params-file ${param_file} +nextflow run . \ + -main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \ + -profile docker \ + -with-trace \ + -c src/common/nextflow_helpers/labels_ci.config \ + -params-file ${param_file} # ./tw-windows-x86_64.exe launch ` # https://github.com/openproblems-bio/task_grn_inference.git ` @@ -53,11 +56,11 @@ HERE # --params-file ./params/single_omics_inference.yaml ` # --config src/common/nextflow_helpers/labels_tw.config -./tw launch https://github.com/openproblems-bio/task_grn_inference \ - --revision build/main \ - --pull-latest \ - --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file ${param_file} \ - --config src/common/nextflow_helpers/labels_tw.config +# ./tw launch https://github.com/openproblems-bio/task_grn_inference \ +# --revision build/main \ +# --pull-latest \ +# --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \ +# --workspace 53907369739130 \ +# --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ +# --params-file ${param_file} \ +# --config src/common/nextflow_helpers/labels_tw.config diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 2771b58c8..c206708c1 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -41,7 +41,8 @@ functionality: - name: --cell_type_specific type: boolean direction: input - default: false + default: true + diff --git a/src/control_methods/baseline_corr/config.vsh.yaml b/src/control_methods/baseline_corr/config.vsh.yaml index ef2390a53..6d96624f3 100644 --- a/src/control_methods/baseline_corr/config.vsh.yaml +++ b/src/control_methods/baseline_corr/config.vsh.yaml @@ -7,7 +7,6 @@ functionality: label: baseline_corr summary: "Baseline based on correlation" arguments: - - name: --causal type: boolean direction: input diff --git a/src/control_methods/baseline_corr/script.py b/src/control_methods/baseline_corr/script.py index 42488e13f..e4d081219 100644 --- a/src/control_methods/baseline_corr/script.py +++ b/src/control_methods/baseline_corr/script.py @@ -51,7 +51,8 @@ def corr_net(X, gene_names, par): return net def create_corr_net(X, gene_names, groups, par): - if par['cell_type_specific']: + # if par['cell_type_specific']: + if True: i = 0 for group in tqdm(np.unique(groups), desc="Processing groups"): X_sub = X[groups == group, :] @@ -96,7 +97,9 @@ def create_meta_cells(df, n_cells=15): tf_all = np.intersect1d(tf_all, gene_names) print('Noramlize data') -multiomics_rna.X = multiomics_rna.layers['lognorm'] +sc.pp.normalize_total(multiomics_rna) +sc.pp.log1p(multiomics_rna) +sc.pp.scale(multiomics_rna) if par['impute']: print("imputing") diff --git a/src/control_methods/pearson/config.vsh.yaml b/src/control_methods/pearson/config.vsh.yaml new file mode 100644 index 000000000..3a874d82c --- /dev/null +++ b/src/control_methods/pearson/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: pearson_corr + namespace: control_methods + info: + label: pearson_corr + summary: "Baseline based on correlation" + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + # packages: [ magic-impute ] + packages: [ ] + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/control_methods/pearson/script.py b/src/control_methods/pearson/script.py new file mode 100644 index 000000000..6a01094b6 --- /dev/null +++ b/src/control_methods/pearson/script.py @@ -0,0 +1,76 @@ +import os +import pandas as pd +import numpy as np +import anndata as ad +import scanpy as sc +from tqdm import tqdm +from scipy.stats import spearmanr +from sklearn.preprocessing import StandardScaler + +## VIASH START +par = { + 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad', + 'tf_all': 'resources/prior/tf_all.csv', + 'causal': False, + 'cell_type_specific': True, + 'max_n_links': 50000, + 'prediction': 'resources/grn_models/donor_0_default/pearson.csv', + "seed": 32 +} +## VIASH END +print(par) + +def process_links(net, par): + net = net[net.source!=net.target] + net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) + net = net_sorted.head(par['max_n_links']).reset_index(drop=True) + return net + +def corr_net(X, gene_names, par): + X = StandardScaler().fit_transform(X) + net = np.dot(X.T, X) / X.shape[0] + net = pd.DataFrame(net, index=gene_names, columns=gene_names) + net = net.sample(len(tf_all), axis=1, random_state=par['seed']) + net = net.reset_index() + index_name = net.columns[0] + net = net.melt(id_vars=index_name, var_name='source', value_name='weight') + + net.rename(columns={index_name: 'target'}, inplace=True) + net = process_links(net, par) + + return net + +def create_corr_net(X, gene_names, groups, par): + if par['cell_type_specific']: + i = 0 + for group in tqdm(np.unique(groups), desc="Processing groups"): + X_sub = X[groups == group, :] + net = corr_net(X_sub, gene_names, par) + net['cell_type'] = group + if i==0: + grn = net + else: + grn = pd.concat([grn, net], axis=0).reset_index(drop=True) + i += 1 + else: + grn = corr_net(X, gene_names, par) + return grn +print('Read data') +multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) + + +gene_names = multiomics_rna.var_names.to_numpy() +tf_all = np.loadtxt(par['tf_all'], dtype=str) +groups = multiomics_rna.obs.cell_type +tf_all = np.intersect1d(tf_all, gene_names) + +print('Noramlize data') +sc.pp.normalize_total(multiomics_rna) +sc.pp.log1p(multiomics_rna) +sc.pp.scale(multiomics_rna) + +print('Create corr net') +net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par) + +print('Output GRN') +net.to_csv(par['prediction']) diff --git a/src/control_methods/pearson/test.sh b/src/control_methods/pearson/test.sh new file mode 100644 index 000000000..884e70266 --- /dev/null +++ b/src/control_methods/pearson/test.sh @@ -0,0 +1,5 @@ +viash run src/control_methods/baseline_corr/config.vsh.yaml -- \ + --prediction output/baseline_corr.csv \ + --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ + --tf_all resources/prior/tf_all.csv \ + --causal true diff --git a/src/control_methods/pearson_causal/config.vsh.yaml b/src/control_methods/pearson_causal/config.vsh.yaml new file mode 100644 index 000000000..b6f3f7fe9 --- /dev/null +++ b/src/control_methods/pearson_causal/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: pearson_causal + namespace: control_methods + info: + label: pearson_causal + summary: "Baseline based on correlation" + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + # packages: [ magic-impute ] + packages: [ ] + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/control_methods/pearson_causal/script.py b/src/control_methods/pearson_causal/script.py new file mode 100644 index 000000000..14fbc349a --- /dev/null +++ b/src/control_methods/pearson_causal/script.py @@ -0,0 +1,77 @@ +import os +import pandas as pd +import numpy as np +import anndata as ad +import scanpy as sc +from tqdm import tqdm +from scipy.stats import spearmanr +from sklearn.preprocessing import StandardScaler + +## VIASH START +par = { + 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad', + 'tf_all': 'resources/prior/tf_all.csv', + 'causal': True, + 'cell_type_specific': True, + 'max_n_links': 50000, + 'prediction': 'resources/grn_models/donor_0_default/pearson_causal.csv', + "seed": 32 +} +## VIASH END +print(par) + +def process_links(net, par): + net = net[net.source!=net.target] + net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) + net = net_sorted.head(par['max_n_links']).reset_index(drop=True) + return net + +def corr_net(X, gene_names, par): + X = StandardScaler().fit_transform(X) + net = np.dot(X.T, X) / X.shape[0] + net = pd.DataFrame(net, index=gene_names, columns=gene_names) + + net = net[tf_all] + net = net.reset_index() + index_name = net.columns[0] + net = net.melt(id_vars=index_name, var_name='source', value_name='weight') + + net.rename(columns={index_name: 'target'}, inplace=True) + net = process_links(net, par) + + return net + +def create_corr_net(X, gene_names, groups, par): + if par['cell_type_specific']: + i = 0 + for group in tqdm(np.unique(groups), desc="Processing groups"): + X_sub = X[groups == group, :] + net = corr_net(X_sub, gene_names, par) + net['cell_type'] = group + if i==0: + grn = net + else: + grn = pd.concat([grn, net], axis=0).reset_index(drop=True) + i += 1 + else: + grn = corr_net(X, gene_names, par) + return grn +print('Read data') +multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) + + +gene_names = multiomics_rna.var_names.to_numpy() +tf_all = np.loadtxt(par['tf_all'], dtype=str) +groups = multiomics_rna.obs.cell_type +tf_all = np.intersect1d(tf_all, gene_names) + +print('Noramlize data') +sc.pp.normalize_total(multiomics_rna) +sc.pp.log1p(multiomics_rna) +sc.pp.scale(multiomics_rna) + +print('Create corr net') +net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par) + +print('Output GRN') +net.to_csv(par['prediction']) diff --git a/src/control_methods/pearson_causal/test.sh b/src/control_methods/pearson_causal/test.sh new file mode 100644 index 000000000..884e70266 --- /dev/null +++ b/src/control_methods/pearson_causal/test.sh @@ -0,0 +1,5 @@ +viash run src/control_methods/baseline_corr/config.vsh.yaml -- \ + --prediction output/baseline_corr.csv \ + --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ + --tf_all resources/prior/tf_all.csv \ + --causal true diff --git a/src/control_methods/positive_control/config.vsh.yaml b/src/control_methods/positive_control/config.vsh.yaml new file mode 100644 index 000000000..943d48e1c --- /dev/null +++ b/src/control_methods/positive_control/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: positive_control + namespace: control_methods + info: + label: positive_control + summary: "Baseline based on correlation" + arguments: + - name: --perturbation_data + type: file + required: true + direction: input + example: resources_test/grn-benchmark/perturbation_data.h5ad + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + # packages: [ magic-impute ] + packages: [ ] + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/control_methods/positive_control/script.py b/src/control_methods/positive_control/script.py new file mode 100644 index 000000000..2711cea83 --- /dev/null +++ b/src/control_methods/positive_control/script.py @@ -0,0 +1,74 @@ +import os +import pandas as pd +import numpy as np +import anndata as ad +import scanpy as sc +from tqdm import tqdm +from scipy.stats import spearmanr +from sklearn.preprocessing import StandardScaler + +## VIASH START +par = { + 'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad', + 'tf_all': 'resources/prior/tf_all.csv', + 'causal': True, + 'cell_type_specific': True, + 'max_n_links': 50000, + 'prediction': 'resources/grn_models/positive_control.csv', + "seed": 32 +} +## VIASH END +print(par) + +def process_links(net, par): + net = net[net.source!=net.target] + net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) + net = net_sorted.head(par['max_n_links']).reset_index(drop=True) + return net + +def corr_net(X, gene_names, par): + X = StandardScaler().fit_transform(X) + net = np.dot(X.T, X) / X.shape[0] + net = pd.DataFrame(net, index=gene_names, columns=gene_names) + + net = net[tf_all] + net = net.reset_index() + index_name = net.columns[0] + net = net.melt(id_vars=index_name, var_name='source', value_name='weight') + + net.rename(columns={index_name: 'target'}, inplace=True) + net = process_links(net, par) + + return net + +def create_corr_net(X, gene_names, groups, par): + if par['cell_type_specific']: + i = 0 + for group in tqdm(np.unique(groups), desc="Processing groups"): + X_sub = X[groups == group, :] + net = corr_net(X_sub, gene_names, par) + net['cell_type'] = group + if i==0: + grn = net + else: + grn = pd.concat([grn, net], axis=0).reset_index(drop=True) + i += 1 + else: + grn = corr_net(X, gene_names, par) + return grn +print('Read data') +multiomics_rna = ad.read_h5ad(par["perturbation_data"]) + + +gene_names = multiomics_rna.var_names.to_numpy() +tf_all = np.loadtxt(par['tf_all'], dtype=str) +groups = multiomics_rna.obs.cell_type +tf_all = np.intersect1d(tf_all, gene_names) + + + +print('Create corr net') +net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par) + +print('Output GRN') +net.to_csv(par['prediction']) diff --git a/src/control_methods/positive_control/test.sh b/src/control_methods/positive_control/test.sh new file mode 100644 index 000000000..884e70266 --- /dev/null +++ b/src/control_methods/positive_control/test.sh @@ -0,0 +1,5 @@ +viash run src/control_methods/baseline_corr/config.vsh.yaml -- \ + --prediction output/baseline_corr.csv \ + --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ + --tf_all resources/prior/tf_all.csv \ + --causal true diff --git a/src/methods/multi_omics/celloracle/config.novsh.yaml b/src/methods/multi_omics/celloracle/config.vsh.yaml similarity index 100% rename from src/methods/multi_omics/celloracle/config.novsh.yaml rename to src/methods/multi_omics/celloracle/config.vsh.yaml diff --git a/src/methods/single_omics/grnboost2/script.py b/src/methods/single_omics/grnboost2/script.py index 8aeaa5e75..42854c4b1 100644 --- a/src/methods/single_omics/grnboost2/script.py +++ b/src/methods/single_omics/grnboost2/script.py @@ -26,7 +26,7 @@ def process_links(net, par): return net # Load scRNA-seq data adata_rna = anndata.read_h5ad(par['multiomics_rna']) -adata_rna.X = adata_rna.layers['lognorm'] #TODO: fix this +# adata_rna.X = adata_rna.layers['lognorm'] #TODO: fix this groups = adata_rna.obs.cell_type gene_names = adata_rna.var.gene_ids.index.to_numpy() X = adata_rna.X @@ -48,7 +48,7 @@ def infer_grn(X, par): return network - +# par['cell_type_specific'] = False if par['cell_type_specific']: i = 0 for group in tqdm(np.unique(groups), desc="Processing groups"): diff --git a/src/methods/single_omics/portia/script.py b/src/methods/single_omics/portia/script.py index a17104ce6..8abf51aec 100644 --- a/src/methods/single_omics/portia/script.py +++ b/src/methods/single_omics/portia/script.py @@ -52,7 +52,7 @@ def infer_grn(X, gene_names): grn = process_links(grn, par) return grn - +# par['cell_type_specific'] = False if par['cell_type_specific']: groups = adata_rna.obs.cell_type i = 0 diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py index 40ecfe830..4d1ef6183 100644 --- a/src/metrics/regression_1/script.py +++ b/src/metrics/regression_1/script.py @@ -19,9 +19,9 @@ 'max_workers': 4, } ## VIASH END -meta = { - "resources_dir":'src/metrics/regression_1/' -} +# meta = { +# "resources_dir":'src/metrics/regression_1/' +# } sys.path.append(meta["resources_dir"]) from main import main diff --git a/src/workflows/run_benchmark_single_omics/config.vsh.yaml b/src/workflows/run_benchmark_all/config.vsh.yaml similarity index 84% rename from src/workflows/run_benchmark_single_omics/config.vsh.yaml rename to src/workflows/run_benchmark_all/config.vsh.yaml index 70214a66f..f1f13c087 100644 --- a/src/workflows/run_benchmark_single_omics/config.vsh.yaml +++ b/src/workflows/run_benchmark_all/config.vsh.yaml @@ -1,9 +1,9 @@ functionality: - name: run_benchmark_single_omics + name: run_benchmark namespace: "workflows" info: - label: run_benchmark_single_omics + label: run_benchmark summary: "Infer GRNs and evaluate each prediction for single omics grns" argument_groups: @@ -12,9 +12,9 @@ functionality: - name: --multiomics_rna type: file direction: input - # - name: --multiomics_atac - # type: file - # direction: input + - name: --multiomics_atac + type: file + direction: input - name: --perturbation_data type: file direction: input @@ -49,6 +49,12 @@ functionality: required: false direction: input default: pearson + - name: --cell_type_specific + type: boolean + required: false + direction: input + default: true + - name: Outputs arguments: @@ -92,8 +98,13 @@ functionality: - name: grn_methods/ppcor # - name: grn_methods/scsgl # - name: grn_methods/tigress + - name: grn_methods/celloracle - name: grn_methods/scgpt - name: grn_methods/scenic + - name: control_methods/pearson_corr + - name: control_methods/pearson_causal + - name: control_methods/negative_control + - name: control_methods/positive_control repositories: - name: openproblems type: github diff --git a/src/workflows/run_benchmark_single_omics/main.nf b/src/workflows/run_benchmark_all/main.nf similarity index 94% rename from src/workflows/run_benchmark_single_omics/main.nf rename to src/workflows/run_benchmark_all/main.nf index 79030a692..d653e39ef 100644 --- a/src/workflows/run_benchmark_single_omics/main.nf +++ b/src/workflows/run_benchmark_all/main.nf @@ -19,7 +19,14 @@ workflow run_wf { grnboost2, ppcor, scgpt, - scenic + scenic, + + pearson_corr, + pearson_causal, + negative_control, + positive_control, + + celloracle ] @@ -47,7 +54,6 @@ workflow run_wf { // ] // } // ) - /*************************** * RUN METHODS AND METRICS * ***************************/ @@ -74,7 +80,11 @@ workflow run_wf { // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ multiomics_rna: "multiomics_rna", + multiomics_atac: "multiomics_atac", tf_all: "tf_all", + perturbation_data:"perturbation_data", + cell_type_specific:"cell_type_specific" + ], // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp -> @@ -105,7 +115,8 @@ workflow run_wf { max_workers: "max_workers", consensus: "consensus", tf_all: "tf_all", - layer:"layer" + layer:"layer", + cell_type_specific:"cell_type_specific" ], // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp ->