Commit

workflow bug fixed
janursa committed Nov 28, 2024
1 parent 4cd5eac commit b76d6c0
Showing 6 changed files with 74 additions and 63 deletions.
23 changes: 10 additions & 13 deletions scripts/run_benchmark_all.sh
@@ -1,18 +1,17 @@
#!/bin/bash

RUN_ID="d0_hvgs_baseline"
dataset="op"
RUN_ID=${dataset}
# resources_dir="./resources/"
resources_dir="s3://openproblems-data/resources/grn"
publish_dir="${resources_dir}/results/${RUN_ID}"

reg_type=ridge
subsample=-2
subsample=-1
num_workers=10
layer='scgen_pearson'
layer='X_norm'
metric_ids="[regression_1, regression_2]"
cell_type_specific=false #for controls
normalize=false
method_ids="[pearson_corr, positive_control]"
method_ids="[negative_control, pearson_corr, positive_control, portia, scgpt]"

param_file="./params/${RUN_ID}.yaml"

@@ -22,17 +21,15 @@ param_list:
- id: ${reg_type}
metric_ids: $metric_ids
method_ids: $method_ids
perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_d0_hvg.h5ad
multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac_d0.h5ad
evaluation_data: ${resources_dir}/evaluation_datasets/${dataset}_perturbation.h5ad
rna: ${resources_dir}/inference_datasets/${dataset}_rna.h5ad
atac: ${resources_dir}/inference_datasets/${dataset}_atac.h5ad
reg_type: $reg_type
subsample: $subsample
num_workers: $num_workers
layer: $layer
consensus: ${resources_dir}/prior/consensus-num-regulators.json
consensus: ${resources_dir}/prior/${dataset}_consensus-num-regulators.json
tf_all: ${resources_dir}/prior/tf_all.csv
cell_type_specific: ${cell_type_specific}
normalize: ${normalize}
output_state: "state.yaml"
publish_dir: "$publish_dir"
@@ -60,6 +57,6 @@ HERE
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--compute-env 5DwwhQoBi0knMSGcwThnlF \
--params-file ${param_file} \
--config src/common/nextflow_helpers/labels_tw.config
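
As an aside (illustrative only, not part of this commit), a minimal Python sketch of the param_list entry that the heredoc above renders for dataset="op". It assumes PyYAML is installed, and the top-level placement of output_state/publish_dir is an assumption about the generated file's nesting:

# Illustrative sketch: mirrors the heredoc fields above for dataset="op".
# Assumes PyYAML; nesting of output_state/publish_dir is an assumption.
import yaml

dataset = "op"
resources_dir = "s3://openproblems-data/resources/grn"
params = {
    "param_list": [{
        "id": "ridge",
        "metric_ids": ["regression_1", "regression_2"],
        "method_ids": ["negative_control", "pearson_corr", "positive_control", "portia", "scgpt"],
        "evaluation_data": f"{resources_dir}/evaluation_datasets/{dataset}_perturbation.h5ad",
        "rna": f"{resources_dir}/inference_datasets/{dataset}_rna.h5ad",
        "atac": f"{resources_dir}/inference_datasets/{dataset}_atac.h5ad",
        "reg_type": "ridge",
        "subsample": -1,
        "num_workers": 10,
        "layer": "X_norm",
        "consensus": f"{resources_dir}/prior/{dataset}_consensus-num-regulators.json",
        "tf_all": f"{resources_dir}/prior/tf_all.csv",
    }],
    "output_state": "state.yaml",
    "publish_dir": f"{resources_dir}/results/{dataset}",
}
print(yaml.safe_dump(params, sort_keys=False))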
105 changes: 57 additions & 48 deletions src/metrics/script_all.py
@@ -5,39 +5,36 @@
import os


def define_par(dataset):
def define_par(dataset, global_models=False):

par = {
'reg_type': 'ridge',
'models_dir': f"resources/grn_models/{dataset}",
'scores_dir': f"output/temp/{dataset}",

'models': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],
'models': [ 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],

# 'models': [ 'positive_control', 'pearson_corr'],
'global_models': [
'ANANSE_tissue/networks/lung.parquet',
'ANANSE_tissue/networks/stomach.parquet',
'ANANSE_tissue/networks/heart.parquet',
'ANANSE_tissue/networks/bone_marrow.parquet',

'gtex_rna/networks/Whole_Blood.parquet',
'gtex_rna/networks/Brain_Amygdala.parquet',
'gtex_rna/networks/Breast_Mammary_Tissue.parquet',
'gtex_rna/networks/Lung.parquet',
'gtex_rna/networks/Stomach.parquet',


'cellnet_human_Hg1332/networks/bcell.parquet',
'cellnet_human_Hg1332/networks/tcell.parquet',
'cellnet_human_Hg1332/networks/skin.parquet',
'cellnet_human_Hg1332/networks/neuron.parquet',
'cellnet_human_Hg1332/networks/heart.parquet',
'collectri',
'Ananse:Lung',
'Ananse:Stomach',
'Ananse:Heart',
'Ananse:Bone marrow',
'Gtex:Whole blood',
'Gtex:Brain amygdala',
'Gtex:Breast mammary tissue',
'Gtex:Lung',
'Gtex:Stomach',
'Cellnet:Bcell',
'Cellnet:Tcell',
'Cellnet:Skin',
'Cellnet:Neuron',
'Cellnet:Heart'
],
'global_models_dir': '../eric/network_collection/networks/',
'global_models_dir': 'resources/grn_models/global/',

"evaluation_data": f"resources/evaluation_datasets/{dataset}_perturbation.h5ad",
'consensus': f'resources/prior/{dataset}_consensus-num-regulators.json',
'consensus': f'resources/prior/{dataset}_consensus-num-regulators.json',

'layer': 'X_norm',

@@ -48,6 +45,32 @@ def define_par(dataset):
'verbose': 4,
'num_workers': 20
}
if global_models:
import shutil

temp_grn_dir = 'output/models/'
os.makedirs(temp_grn_dir, exist_ok=True)

grn_file_list = []
for model in par['global_models']:
grn_file = f"{par['global_models_dir']}/{model}.csv"
grn_file_list.append(grn_file)

for model in par['models']:
grn_file = f"{par['models_dir']}/{model}.csv"
grn_file_list.append(grn_file)

par['models'] = par['models'] + par['global_models']
par['models_dir'] = temp_grn_dir
par['consensus'] = f'{temp_grn_dir}/{dataset}_consensus-num-regulators.json'
for grn_file in grn_file_list:
try:
shutil.copy(grn_file, temp_grn_dir)
print(f"Copied {grn_file} to {temp_grn_dir}")
except FileNotFoundError:
print(f"File not found: {grn_file}")
except Exception as e:
print(f"Error copying {grn_file}: {e}")
return par


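A hypothetical usage sketch (not part of the commit) of the new global_models branch: when enabled, define_par stages both the dataset-specific and the global networks as CSVs under one temporary directory and merges the two model lists, so the downstream loop can treat every network uniformly.

# Hypothetical usage of define_par with global models enabled;
# assumes define_par from this file is importable/defined.
par = define_par("op", global_models=True)

# par['models'] is now the union of local and global model names, and every
# network is expected at f"{par['models_dir']}/{model}.csv",
# e.g. output/models/pearson_corr.csv or output/models/Ananse:Lung.csv.
for model in par["models"]:
    print(f"{par['models_dir']}/{model}.csv")
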
@@ -66,45 +89,31 @@ def define_par(dataset):
from consensus.script import main as main_consensus

# - run general models
global_models = False
global_models = True

# - run metrics
for dataset in ['op', 'replogle2', 'nakatake', 'norman', 'adamson']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson'
for dataset in ['op']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson'
print('------ ', dataset, '------')
par = define_par(dataset)
os.makedirs(par['scores_dir'], exist_ok=True)
par = define_par(dataset, global_models=global_models)
main_consensus(par)
for binarize in [True]:
par['binarize'] = binarize
for max_n_links in [10000]:
for max_n_links in [50000]:
par['max_n_links'] = max_n_links
for apply_skeleton in [False]:
par['apply_skeleton'] = apply_skeleton
# - determines models to run
grn_files_dict = {}
# - add global models
if global_models:
for model in par['global_models']:
temp_dir = f"{par['scores_dir']}/nets/"
os.makedirs(temp_dir, exist_ok=True)
net = pd.read_parquet(f"{par['global_models_dir']}/{model}")
net.columns = ['source','target','weight']
net = process_links(net, par)
if par['binarize']:
net['weight'] = net['weight'].apply(binarize_weight)
model = model.replace('/','_')
grn_file = f'{temp_dir}/{model}.csv'
net.to_csv(grn_file)
grn_files_dict[model] = grn_file
else:
# - add actual models
for model in par['models']:
print(model)
grn_file = f"{par['models_dir']}/{model}.csv"
if not os.path.exists(grn_file):
print(f"{grn_file} doesnt exist. Skipped.")
continue
grn_files_dict[model] = grn_file
# - add models
for model in par['models']:
print(model)
grn_file = f"{par['models_dir']}/{model}.csv"
if not os.path.exists(grn_file):
print(f"{grn_file} doesnt exist. Skipped.")
continue
grn_files_dict[model] = grn_file

# - actual runs
i = 0
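
Note (illustrative, and an assumption about the data layout): because the copy step builds paths as f"{par['global_models_dir']}/{model}.csv", each renamed global network is expected as a CSV file named exactly like its list entry, colons and spaces included. A quick check could look like:

# Illustrative check of the assumed global-network file layout.
import os

global_models_dir = "resources/grn_models/global"
for name in ["collectri", "Ananse:Lung", "Gtex:Whole blood", "Cellnet:Bcell"]:
    path = os.path.join(global_models_dir, f"{name}.csv")
    print(path, "->", "found" if os.path.exists(path) else "missing")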
3 changes: 3 additions & 0 deletions src/workflows/run_benchmark/config.vsh.yaml
@@ -95,6 +95,9 @@ functionality:
# ---- multiomics
- name: grn_methods/celloracle
- name: grn_methods/scglue
- name: grn_methods/figr
- name: grn_methods/scenicplus
- name: grn_methods/granie
# ---- baselines
- name: control_methods/pearson_corr
- name: control_methods/negative_control
6 changes: 4 additions & 2 deletions src/workflows/run_benchmark/main.nf
@@ -14,7 +14,6 @@ workflow run_wf {
// construct list of methods
methods = [
portia,
genie3,
grnboost2,
ppcor,
scenic,
@@ -26,7 +25,10 @@ positive_control,
positive_control,

celloracle,
scglue
scglue,
granie,
figr,
scenicplus
]

