baseline corr updated

openproblems-bio · Sep 13, 2024 · 4a35f67 · 4a35f67
1 parent 85c6435
commit 4a35f67
Show file tree

Hide file tree

Showing 10 changed files with 184 additions and 137 deletions.
diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh
@@ -2,6 +2,7 @@
 
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
 # reg_type=${1} #GB, ridge
+viash ns build --parallel
 reg_type=ridge
 
 RUN_ID="grn_evaluation_all_${reg_type}"
@@ -80,15 +81,14 @@ HERE
 # done
 
 ## controls
-# append_entry_control "negative_control" "False" ""
-# append_entry_control "positive_control" "False" ""
-# append_entry_control "baseline_pearson" "False" "pearson"
+# append_entry_control "negative_control" "" "" "false" "false" "false"
+# append_entry_control "positive_control" "" "" "false" "false" "false"
+append_entry_control "baseline_pearson" "False" "pearson" "false" "false" "false"
 append_entry_control "baseline_dotproduct" "False" "dotproduct" "false" "false" "false"
-# append_entry_control "baseline_pearson_causal" "True" "pearson" 
 append_entry_control "baseline_dotproduct_causal" "True" "dotproduct" "false" "false" "false"
-# append_entry_control "baseline_dotproduct_causal_cell_type" "True" "dotproduct" "true"
-# append_entry_control "baseline_dotproduct_causal_metacell" "True" "dotproduct" "false" "true"
-# append_entry_control "baseline_dotproduct_causal_impute" "True" "dotproduct" "false" "false" "true"
+append_entry_control "baseline_dotproduct_causal_cell_type" "True" "dotproduct" "true" "false" "false"
+append_entry_control "baseline_dotproduct_causal_metacell" "True" "dotproduct" "false" "true" "false"
+append_entry_control "baseline_dotproduct_causal_impute" "True" "dotproduct" "false" "false" "true"
 # append_entry_control "baseline_corr_causal_spearman" "True" "spearman"
 
 
@@ -104,6 +104,7 @@ nextflow run . \
   -with-trace \
   -c src/common/nextflow_helpers/labels_ci.config \
   -params-file ${param_file}
+subl resources/results/grn_evaluation_all_ridge/scores.yaml
 
 # ./tw-windows-x86_64.exe launch `
 #     https://github.com/openproblems-bio/task_grn_inference.git `

diff --git a/src/control_methods/baseline_corr/script.py b/src/control_methods/baseline_corr/script.py
@@ -9,45 +9,56 @@
 ## VIASH START
 par = {
 }
+
+print(par)
+
 ## VIASH END
+
+def select_top_links(net, par):
+    net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
+    net = net_sorted.head(par['max_n_links']).reset_index(drop=True)
+    return net
+
 def create_corr_net(X: np.ndarray, groups: np.ndarray, method="pearson"):
-    grn = pd.DataFrame()
+    i = 0
     for group in tqdm(np.unique(groups), desc="Processing groups"):
         X_sub = X[groups == group, :]
         if method == "dotproduct":
-            print('dotproduct')
             net = X_sub.T.dot(X_sub)
         elif method == "pearson":
-            print('pearson')
             net = np.corrcoef(X_sub.T)
+            net = np.nan_to_num(net, nan=0.0, posinf=0.0, neginf=0.0)
         elif method == "spearman":
             net = spearmanr(X_sub).statistic
-        net = np.nan_to_num(net, nan=0.0, posinf=0.0, neginf=0.0)
-        
+
+
         net = pd.DataFrame(net, index=gene_names, columns=gene_names)
         if par['causal']:
-            print('causal')
             net = net[tf_all]
         else:
-            print('non causal')
             net = net.sample(len(tf_all), axis=1, random_state=par['seed'])
-
+        # flatten to source-target-weight    
         net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
         net.rename(columns={'index': 'target'}, inplace=True)
 
+        net = select_top_links(net, par)
         net['cell_type'] = group
-
-        grn = pd.concat([grn, net], axis=0).reset_index(drop=True)
+        if i==0:
+            grn = net
+        else:
+            grn = pd.concat([grn, net], axis=0).reset_index(drop=True)
+
+        i += 1
 
     if par['cell_type_specific']==False:
-        print('Non specific')
         grn.drop(columns=['cell_type'], inplace=True)
         grn = grn.groupby(['source', 'target']).mean().reset_index()
-    print(grn)
+        net = select_top_links(net, par)
+
     return grn
 print('Read data')
 multiomics_rna = ad.read_h5ad(par["multiomics_rna"])
-multiomics_rna = multiomics_rna[:,:2000] #TODO: togo
+# multiomics_rna = multiomics_rna[:,:5000] #TODO: togo
 
 if par['metacell']:
     print('metacell')
@@ -108,6 +119,5 @@ def create_meta_cells(df, n_cells=15):
 
 
 
-
 print('Output GRN')
 net.to_csv(par['prediction'])
diff --git a/src/control_methods/baseline_corr/test.sh b/src/control_methods/baseline_corr/test.sh
@@ -0,0 +1,5 @@
+viash run src/control_methods/baseline_corr/config.vsh.yaml -- \
+  --prediction output/baseline_corr.csv \
+  --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+  --tf_all resources/prior/tf_all.csv \
+  --causal true
diff --git a/src/exp_analysis/config.vsh.yaml b/src/exp_analysis/config.vsh.yaml
@@ -10,22 +10,23 @@ functionality:
       type: file
       required: false
       direction: input
-      default: resources/grn-benchmark/perturbation_data.h5ad
       example: resources_test/grn-benchmark/perturbation_data.h5ad
-    - name: --tf_gene_net
+    - name: --prediction
       type: file
       required: true
       direction: input
+      example: resources_test/grn_models/collectri.csv
     - name: --peak_gene_net
       type: file
       required: false
       direction: input
+      example: resources_test/peak_gene_models/collectri.csv
     - name: --annot_peak_database
       type: file 
       required: false 
       direction: input
       must_exist: false
-      example: resources/supplements/annot_peak_database.csv
+      example: resources_test/supplementary/annot_peak_database.csv
     # - name: --annot_gene_database
     #   type: file 
     #   required: false 

diff --git a/src/exp_analysis/run.sh b/src/exp_analysis/run.sh
diff --git a/src/exp_analysis/script.py b/src/exp_analysis/script.py
@@ -6,7 +6,7 @@
 ## VIASH START
 par = {
   "perturbation_data": "resources/grn-benchmark/pertubation_data.h5ad",
-  "tf_gene_net": "resources/grn-benchmark/grn_models/figr.csv",
+  "prediction": "resources/grn-benchmark/grn_models/figr.csv",
   # "peak_gene_net": "resources/grn-benchmark/peak_gene_models/figr.csv",
   "annot_peak_database": "resources/grn-benchmark/supp/annot_peak_database.csv",
   "annot_gene_database": "resources/grn-benchmark/supp/annot_gene_database.csv",
@@ -24,13 +24,13 @@
 print('Reading input files', flush=True)
 
 perturbation_data = ad.read_h5ad(par["perturbation_data"])
-tf_gene_net = pd.read_csv(par["tf_gene_net"])
+prediction = pd.read_csv(par["prediction"])
 # peak_gene_net = pd.read_csv(par["peak_gene_net"])
 # annot_peak_database = pd.read_csv(par["annot_peak_database"])
 # hvgs = pd.read_csv(par["hvgs"])
 
 # peak_gene_net['source'] = peak_gene_net['peak']
-info_obj = Explanatory_analysis(net=tf_gene_net)
+info_obj = Explanatory_analysis(net=prediction)
 print("Calculate basic stats")
 stats = info_obj.calculate_basic_stats()
 print(stats)

diff --git a/src/exp_analysis/test.sh b/src/exp_analysis/test.sh
@@ -0,0 +1,4 @@
+viash run src/exp_analysis/config.vsh.yaml -- \
+    --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
+    --prediction output/baseline_corr.csv \
+
diff --git a/src/metrics/regression_1/test.sh b/src/metrics/regression_1/test.sh
@@ -0,0 +1,5 @@
+viash run src/metrics/regression_1/config.vsh.yaml -- \
+    --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
+    --tf_all resources/prior/tf_all.csv \
+    --prediction output/baseline_corr.csv \
+    --score output/score.h5ad