From dd969e0eed413b90e2b935e6de09cf738542879d Mon Sep 17 00:00:00 2001 From: jalil Date: Fri, 30 Aug 2024 10:07:49 +0200 Subject: [PATCH] single omics workflow updated --- scripts/run_benchmark_single_omics.sh | 10 ++++----- src/api/comp_metric.yaml | 4 ++++ .../single_omics/tigress/config.vsh.yaml | 2 +- src/metrics/regression_1/config.vsh.yaml | 5 +++++ src/metrics/regression_1/main.py | 22 +++++++++++-------- src/metrics/regression_2/main.py | 5 +++-- src/process_data/test_data/config.vsh.yaml | 12 +++++----- src/process_data/test_data/script.py | 2 +- .../run_benchmark_single_omics/main.nf | 7 ++++-- 9 files changed, 43 insertions(+), 26 deletions(-) diff --git a/scripts/run_benchmark_single_omics.sh b/scripts/run_benchmark_single_omics.sh index a0421b5c1..88f9ec408 100644 --- a/scripts/run_benchmark_single_omics.sh +++ b/scripts/run_benchmark_single_omics.sh @@ -1,12 +1,12 @@ #!/bin/bash # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -RUN_ID="single_omics_try1" -resources_dir="s3://openproblems-data/resources/grn" -publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}" +RUN_ID="single_omics" +# resources_dir="s3://openproblems-data/resources_test/grn" +# publish_dir="s3://openproblems-data/resources_test/grn/results/${RUN_ID}" -# resources_dir="./resources_test/" -# publish_dir="output/${RUN_ID}" +resources_dir="./resources_test/" +publish_dir="output/${RUN_ID}" reg_type=ridge subsample=-2 diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index bef57c5e0..e88a1b6ca 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -51,6 +51,10 @@ functionality: type: file direction: input default: 'resources/prior/tf_all.csv' + - name: --apply_tf + type: boolean + required: false + default: true diff --git a/src/methods/single_omics/tigress/config.vsh.yaml b/src/methods/single_omics/tigress/config.vsh.yaml index d88d6b5ec..3d0be3eef 100644 --- a/src/methods/single_omics/tigress/config.vsh.yaml +++ b/src/methods/single_omics/tigress/config.vsh.yaml @@ -26,4 +26,4 @@ platforms: - type: native - type: nextflow directives: - label: [midtime,midmem,midcpu] + label: [midtime, midmem, highcpu] diff --git a/src/metrics/regression_1/config.vsh.yaml b/src/metrics/regression_1/config.vsh.yaml index 2cc386017..008d43f9e 100644 --- a/src/metrics/regression_1/config.vsh.yaml +++ b/src/metrics/regression_1/config.vsh.yaml @@ -13,6 +13,11 @@ functionality: direction: input required: false default: pearson + - name: --min_tf + type: integer + direction: input + description: calculate the scores for the given min tfs in addition to the default + required: false resources: - type: python_script path: script.py diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index 656e527aa..48781be5c 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -177,7 +177,8 @@ def main(par): gene_names = perturbation_data.var.index.to_numpy() net = pd.read_csv(par['prediction']) # subset to keep only those links with source as tf - net = net[net.source.isin(tf_all)] + if par['apply_tf']: + net = net[net.source.isin(tf_all)] subsample = par['subsample'] reg_type = par['reg_type'] @@ -210,13 +211,15 @@ def main(par): net_processed = process_net(net.copy(), gene_names, manipulate) print(f'Compute metrics for layer: {layer}', flush=True) + tfs_cases = [-1] + if par['min_tf']: + tfs_cases += par['min_tf'] layer_results = {} # Store results for this layer - for exclude_missing_genes in [False]: # two settings on target gene - for tf_n in [-1]: # two settings on tfs + for exclude_missing_genes in [False, True]: # two settings on target gene + for tf_n in tfs_cases: # two settings on tfs run_key = f'ex({exclude_missing_genes})_tf({tf_n})' print(run_key) net_subset = net_processed.copy() - # Subset TFs if tf_n == -1: degrees = net_subset.abs().sum(axis=0) @@ -234,11 +237,12 @@ def main(par): # Convert results to DataFrame df_results = pd.DataFrame(layer_results) - # if 'ex(True)_tf(140)' not in df_results.columns: - # df_results['ex(True)_tf(140)'] = df_results['ex(True)_tf(-1)'] - # if 'ex(False)_tf(140)' not in df_results.columns: - # df_results['ex(False)_tf(140)'] = df_results['ex(False)_tf(-1)'] - + if par['min_tf']: + if 'ex(True)_tf(140)' not in df_results.columns: + df_results['ex(True)_tf(140)'] = df_results['ex(True)_tf(-1)'] + if 'ex(False)_tf(140)' not in df_results.columns: + df_results['ex(False)_tf(140)'] = df_results['ex(False)_tf(-1)'] + df_results['Mean'] = df_results.mean(axis=1) return df_results \ No newline at end of file diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py index d227af665..a46fb82ba 100644 --- a/src/metrics/regression_2/main.py +++ b/src/metrics/regression_2/main.py @@ -277,8 +277,9 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: n_features_theta_max = np.asarray([data[gene_name]['1'] for gene_name in gene_names], dtype=int) # Load list of putative TFs - df = pd.read_csv(par['tf_all'], header=None, names=['gene_name']) - tf_names = set(list(df['gene_name'].to_numpy())) + tf_names = np.loadtxt(par['tf_all'], dtype=str) + if par['apply_tf']==False: + tf_names = gene_names # Evaluate GRN print(f'Compute metrics for layer: {layer}', flush=True) diff --git a/src/process_data/test_data/config.vsh.yaml b/src/process_data/test_data/config.vsh.yaml index ae82bad3c..134faf1c2 100644 --- a/src/process_data/test_data/config.vsh.yaml +++ b/src/process_data/test_data/config.vsh.yaml @@ -9,33 +9,33 @@ functionality: arguments: - name: --multiomics_rna type: file - required: true + required: false direction: input default: resources/grn-benchmark/multiomics_rna.h5ad - name: --multiomics_rna_test type: file - required: true + required: false direction: output default: resources_test/grn-benchmark/multiomics_rna.h5ad - name: --multiomics_atac type: file - required: true + required: false direction: input default: resources/grn-benchmark/multiomics_atac.h5ad - name: --multiomics_atac_test type: file - required: true + required: false direction: input default: resources_test/grn-benchmark/multiomics_atac.h5ad - name: --perturbation_data type: file - required: true + required: false direction: input default: resources/grn-benchmark/perturbation_data.h5ad - name: --perturbation_data_test type: file - required: true + required: false direction: output default: resources_test/grn-benchmark/perturbation_data.h5ad resources: diff --git a/src/process_data/test_data/script.py b/src/process_data/test_data/script.py index 923d7ad74..c4616bba3 100644 --- a/src/process_data/test_data/script.py +++ b/src/process_data/test_data/script.py @@ -68,4 +68,4 @@ # shorten perturbation adata_bulk = ad.read_h5ad(par['perturbation_data']) -adata_bulk[:200, adata_bulk.var_names.isin(adata_rna_s.var_names)].write(par['perturbation_data_test']) \ No newline at end of file +adata_bulk[:600, adata_bulk.var_names.isin(adata_rna_s.var_names)].write(par['perturbation_data_test']) \ No newline at end of file diff --git a/src/workflows/run_benchmark_single_omics/main.nf b/src/workflows/run_benchmark_single_omics/main.nf index fb647b76c..d81dad698 100644 --- a/src/workflows/run_benchmark_single_omics/main.nf +++ b/src/workflows/run_benchmark_single_omics/main.nf @@ -25,14 +25,17 @@ workflow run_wf { methods = [ portia, - pidc, + ennet, + grnboost2, + scsgl, ppcor, tigress ] // construct list of metrics metrics = [ - regression_1 + regression_1, + regression_2 ] /****************************