Commit: reg2 clipped
janursa committed Aug 18, 2024
1 parent 00c7c93 commit 06a699a

Showing 5 changed files with 174 additions and 28 deletions.
14 changes: 12 additions & 2 deletions src/metrics/regression_1/main.py
@@ -180,8 +180,18 @@ def main(par):
layer = par["layer"]
pert_df = pd.DataFrame(perturbation_data.layers[layer], columns=gene_names)

if subsample != -1:
pert_df = pert_df.sample(n=subsample)
if subsample == -1:
pass
elif subsample == -2: # one combination of cell_type, sm_name
sampled_obs = perturbation_data.obs.groupby(['sm_name', 'cell_type'], observed=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
obs = perturbation_data.obs
# keep only the observations whose obs row matches one of the sampled (sm_name, cell_type) representatives
mask = []
for _, row in obs.iterrows():
mask.append((sampled_obs == row).all(axis=1).any())
perturbation_data = perturbation_data[mask, :]
else:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

# rebuild the expression matrix from the retained observations so the subsampling is reflected downstream
pert_df = pd.DataFrame(perturbation_data.layers[layer], columns=gene_names)
pert_df = pert_df.T # make it gene*sample

# process net
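Side note on this hunk: the subsample == -2 branch keeps exactly one observation per (sm_name, cell_type) combination by sampling one row per group and then matching full obs rows against the sampled set. An equivalent, loop-free selection is sketched below; it is illustrative only, assumes pandas >= 1.1 (for DataFrameGroupBy.sample) and the AnnData object perturbation_data used above, and the fixed random_state is purely for reproducibility.

# Sketch: pick one observation per (sm_name, cell_type) pair via positional indices.
idx = (
    perturbation_data.obs
    .reset_index(drop=True)                            # positional index 0..n_obs-1
    .groupby(['sm_name', 'cell_type'], observed=True)  # only combinations that actually occur
    .sample(n=1, random_state=0)                       # one row per group
    .index
    .to_numpy()
)
perturbation_data = perturbation_data[np.sort(idx), :]  # subset the AnnData by row position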
6 changes: 0 additions & 6 deletions src/metrics/regression_1/script.py
@@ -20,12 +20,6 @@

output = main(par)
print(output)
# output.columns = ['S1', 'S2', 'S3', 'S4']
# output.index=[par["layer"]]
# print("Write output to file", flush=True)
# print(output)
# output.to_csv(par['score'])


metric_ids = output.columns.to_numpy()
metric_values = output.values[0]
70 changes: 58 additions & 12 deletions src/metrics/regression_1/script_all.py
@@ -3,44 +3,90 @@
import sys
import numpy as np
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

## VIASH START
par = {
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
'max_workers': 4,
'max_workers': 40,
'reg_type': 'ridge',
'subsample': 2,
'subsample': -2,
"tf_all": "./resources/prior/tf_all.csv",
"temp_dir": "output"
}

## VIASH END
def create_positive_control(X: np.ndarray, groups: np.ndarray):
grns = []
for group in tqdm(np.unique(groups), desc="Processing groups"):
X_sub = X[groups == group, :]
X_sub = StandardScaler().fit_transform(X_sub)
grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
grns.append(grn)
return np.mean(grns, axis=0)
def create_negative_control(gene_names) -> pd.DataFrame:
ratio = [.98, .01, 0.01]
n_tf = 400
net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio)
net = pd.DataFrame(net, index=gene_names)
return net
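
As a sanity check on create_positive_control: because each group's expression matrix is standardized column-wise before taking X.T @ X / n, the per-group matrix is simply the Pearson correlation matrix of the genes within that cell type, and the returned network is the average of these matrices over groups. A minimal, self-contained check against np.corrcoef (toy data only, not part of the pipeline):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X_sub = rng.normal(size=(50, 10))            # toy group: 50 cells x 10 genes
Xs = StandardScaler().fit_transform(X_sub)   # zero mean, unit variance per gene
grn = Xs.T @ Xs / Xs.shape[0]                # what create_positive_control computes per group
assert np.allclose(grn, np.corrcoef(X_sub, rowvar=False))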


print('Reading input data')
perturbation_data = ad.read_h5ad(par["perturbation_data"])
gene_names = perturbation_data.var_names.to_numpy()
tf_all = np.loadtxt(par['tf_all'], dtype=str)
groups = perturbation_data.obs.cell_type

meta = {
"resources_dir":'./'
}
sys.path.append(meta["resources_dir"])
from main import main
layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson']
grn_models = ['scenicplus','celloracle','figr','granie','scglue','collectri']
grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri']
controls = ['negative_control', 'positive_control']

os.makedirs('output', exist_ok=True)
for grn_model in grn_models:
par["score"] = f"output/{grn_model}.csv"
os.makedirs(par['temp_dir'], exist_ok=True)
for grn_model in controls + grn_models:
par["score"] = f"{par['temp_dir']}/reg1-{grn_model}.csv"
for ii, layer in enumerate(layers):
par['prediction'] = f"resources/grn-benchmark/grn_models/{grn_model}.csv"
par["layer"] = layer
if grn_model=='positive_control':
# print('Inferring GRN')
# net = create_positive_control(perturbation_data.layers[par["layer"]], groups)

# net = pd.DataFrame(net, index=gene_names, columns=gene_names)
# net = net.loc[:, net.columns.isin(tf_all)]

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv"
# pivoted_net.to_csv(par['prediction'])
elif grn_model=='negative_control':
# print('Inferring GRN')
# net = create_negative_control(gene_names)

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/negative_control.csv"
# pivoted_net.to_csv(par['prediction'])
else:
par['prediction'] = f"resources/grn_models/{grn_model}.csv"
output = main(par)
output.index = [layer]


if ii == 0:
score = output
else:
score = pd.concat([score,output], axis=0)
score = pd.concat([score, output], axis=0)

print("Write output to file", flush=True)
print(grn_model, layer, score)

print("Write output to file", flush=True)
score.to_csv(par['score'])
score.to_csv(par['score'])

19 changes: 11 additions & 8 deletions src/metrics/regression_2/main.py
@@ -50,7 +50,7 @@ def cross_validate_gene(
n_jobs: int = 10
) -> Dict[str, float]:

results = {'r2': 0.0, 'mse': np.inf, 'avg-r2': 0.0}
results = {'r2': 0.0, 'avg-r2': 0.0}

if n_features == 0:
return results
@@ -89,9 +89,9 @@ def cross_validate_gene(
y_pred = np.concatenate(y_pred, axis=0)
y_target = np.concatenate(y_target, axis=0)

results['r2'] = float(r2_score(y_target, y_pred))
results['mse'] = float(mean_squared_error(y_target, y_pred))
# results['r2'] = float(r2_score(y_target, y_pred))
results['avg-r2'] = float(np.mean(r2s))


return results

@@ -197,8 +197,10 @@ def static_approach(grn: np.ndarray, n_features: np.ndarray, X: np.ndarray, grou

# Cross-validate each gene using the inferred GRN to define select input features
res = cross_validate(reg_type, gene_names, X, groups, grn, n_features, n_jobs=n_jobs)
mean_r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
mean_r2_scores = mean_r2_scores[mean_r2_scores>-1]

return np.mean([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
return np.mean(mean_r2_scores)
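
The filtering added above drops genes whose cross-validated avg-r2 is -1 or lower before averaging, which is presumably the "clipped" behaviour the commit title refers to: a handful of catastrophically mis-predicted genes can no longer drag the regression_2 score arbitrarily low. A toy illustration of the effect:

import numpy as np

scores = np.array([0.3, 0.1, -0.2, -25.0])  # toy per-gene avg-r2 values
print(scores.mean())                        # -6.2, dominated by the outlier gene
print(scores[scores > -1].mean())           # ~0.067, outlier excluded from the mean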


def main(par: Dict[str, Any]) -> pd.DataFrame:
@@ -214,16 +216,17 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
# Load perturbation data
perturbation_data = ad.read_h5ad(par['perturbation_data'])
subsample = par['subsample']
if subsample != -1:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

if True: # one combination of cell_type, sm_name
if subsample == -1:
pass
elif subsample == -2: # one combination of cell_type, sm_name
sampled_obs = perturbation_data.obs.groupby(['sm_name', 'cell_type'], observed=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
obs = perturbation_data.obs
mask = []
for _, row in obs.iterrows():
mask.append((sampled_obs==row).all(axis=1).any())
perturbation_data = perturbation_data[mask,:]
else:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

gene_names = perturbation_data.var.index.to_numpy()
n_genes = len(gene_names)
93 changes: 93 additions & 0 deletions src/metrics/regression_2/script_all.py
@@ -0,0 +1,93 @@
import pandas as pd
import anndata as ad
import sys
import numpy as np
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

par = {
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
'max_workers': 10,
'reg_type': 'ridge',
'subsample': 2,
"tf_all": "./resources/prior/tf_all.csv",
"temp_dir": "output",
'consensus': 'resources/prior/consensus-num-regulators.json',
}

def create_positive_control(X: np.ndarray, groups: np.ndarray):
grns = []
for group in tqdm(np.unique(groups), desc="Processing groups"):
X_sub = X[groups == group, :]
X_sub = StandardScaler().fit_transform(X_sub)
grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
grns.append(grn)
return np.mean(grns, axis=0)
def create_negative_control(gene_names) -> pd.DataFrame:
ratio = [.98, .01, 0.01]
n_tf = 400
net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio)
net = pd.DataFrame(net, index=gene_names)
return net
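
For context, the negative control built here is a sparse random sign matrix over 400 pseudo-TF columns (roughly 1% entries of +1, 1% of -1, 98% zeros). Turning it into the long (target, source, weight) table expected via par['prediction'] mirrors the commented-out melt logic in the loop below; the sketch uses toy gene names and only prints the result, so it is illustrative rather than part of the script:

# Sketch: melt the gene x TF control matrix into non-zero (target, source, weight) edges.
gene_names_toy = ['GATA1', 'TAL1', 'KLF1']   # hypothetical gene symbols
net = create_negative_control(gene_names_toy)
edges = (net.reset_index()
            .melt(id_vars='index', var_name='source', value_name='weight')
            .rename(columns={'index': 'target'}))
edges = edges[edges['weight'] != 0]
print(edges.head())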


print('Reading input data')
perturbation_data = ad.read_h5ad(par["perturbation_data"])
gene_names = perturbation_data.var_names.to_numpy()
tf_all = np.loadtxt(par['tf_all'], dtype=str)
groups = perturbation_data.obs.cell_type

meta = {
"resources_dir":'./'
}
sys.path.append(meta["resources_dir"])
from main import main
layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson']
grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri']
controls = ['negative_control', 'positive_control']

os.makedirs(par['temp_dir'], exist_ok=True)
for grn_model in controls + grn_models:
par["score"] = f"{par['temp_dir']}/reg2-{grn_model}.csv"
for ii, layer in enumerate(layers):
par["layer"] = layer
if grn_model=='positive_control':
# print('Inferring GRN')
# net = create_positive_control(perturbation_data.layers[par["layer"]], groups)

# net = pd.DataFrame(net, index=gene_names, columns=gene_names)
# net = net.loc[:, net.columns.isin(tf_all)]

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv"
# pivoted_net.to_csv(par['prediction'])
elif grn_model=='negative_control':
# print('Inferring GRN')
# net = create_negative_control(gene_names)

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/negative_control.csv"
# pivoted_net.to_csv(par['prediction'])
else:
par['prediction'] = f"resources/grn_models/{grn_model}.csv"
output = main(par)
output.index = [layer]

if ii == 0:
score = output
else:
score = pd.concat([score, output], axis=0)

print("Write output to file", flush=True)
print(grn_model, layer, score)

print("Write output to file", flush=True)
score.to_csv(par['score'])
