Commit: reg2 clipped
janursa committed Aug 18, 2024
1 parent 00c7c93 commit 06a699a

Showing 5 changed files with 174 additions and 28 deletions.
14 changes: 12 additions & 2 deletions src/metrics/regression_1/main.py
@@ -180,8 +180,18 @@ def main(par):
layer = par["layer"]
pert_df = pd.DataFrame(perturbation_data.layers[layer], columns=gene_names)

if subsample != -1:
pert_df = pert_df.sample(n=subsample)
if subsample == -1:
pass
elif subsample == -2: # one combination of cell_type, sm_name
sampled_obs = perturbation_data.obs.groupby(['sm_name', 'cell_type'], observed=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
obs = perturbation_data.obs
# keep only the observations whose obs row matches one of the sampled (sm_name, cell_type) representatives
mask = []
for _, row in obs.iterrows():
mask.append((sampled_obs == row).all(axis=1).any())
perturbation_data = perturbation_data[mask, :]
else:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

# rebuild the expression matrix from the retained observations so the subsampling is reflected downstream
pert_df = pd.DataFrame(perturbation_data.layers[layer], columns=gene_names)
pert_df = pert_df.T # make it gene*sample

# process net
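Side note on this hunk: the subsample == -2 branch keeps exactly one observation per (sm_name, cell_type) combination by sampling one row per group and then matching full obs rows against the sampled set. An equivalent, loop-free selection is sketched below; it is illustrative only, assumes pandas >= 1.1 (for DataFrameGroupBy.sample) and the AnnData object perturbation_data used above, and the fixed random_state is purely for reproducibility.

# Sketch: pick one observation per (sm_name, cell_type) pair via positional indices.
idx = (
    perturbation_data.obs
    .reset_index(drop=True)                            # positional index 0..n_obs-1
    .groupby(['sm_name', 'cell_type'], observed=True)  # only combinations that actually occur
    .sample(n=1, random_state=0)                       # one row per group
    .index
    .to_numpy()
)
perturbation_data = perturbation_data[np.sort(idx), :]  # subset the AnnData by row position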
6 changes: 0 additions & 6 deletions src/metrics/regression_1/script.py
@@ -20,12 +20,6 @@

output = main(par)
print(output)
# output.columns = ['S1', 'S2', 'S3', 'S4']
# output.index=[par["layer"]]
# print("Write output to file", flush=True)
# print(output)
# output.to_csv(par['score'])


metric_ids = output.columns.to_numpy()
metric_values = output.values[0]
70 changes: 58 additions & 12 deletions src/metrics/regression_1/script_all.py
@@ -3,44 +3,90 @@
import sys
import numpy as np
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

## VIASH START
par = {
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
'max_workers': 4,
'max_workers': 40,
'reg_type': 'ridge',
'subsample': 2,
'subsample': -2,
"tf_all": "./resources/prior/tf_all.csv",
"temp_dir": "output"
}

## VIASH END
def create_positive_control(X: np.ndarray, groups: np.ndarray):
grns = []
for group in tqdm(np.unique(groups), desc="Processing groups"):
X_sub = X[groups == group, :]
X_sub = StandardScaler().fit_transform(X_sub)
grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
grns.append(grn)
return np.mean(grns, axis=0)
def create_negative_control(gene_names) -> pd.DataFrame:
ratio = [.98, .01, 0.01]
n_tf = 400
net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio)
net = pd.DataFrame(net, index=gene_names)
return net
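
As a sanity check on create_positive_control: because each group's expression matrix is standardized column-wise before taking X.T @ X / n, the per-group matrix is simply the Pearson correlation matrix of the genes within that cell type, and the returned network is the average of these matrices over groups. A minimal, self-contained check against np.corrcoef (toy data only, not part of the pipeline):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X_sub = rng.normal(size=(50, 10))            # toy group: 50 cells x 10 genes
Xs = StandardScaler().fit_transform(X_sub)   # zero mean, unit variance per gene
grn = Xs.T @ Xs / Xs.shape[0]                # what create_positive_control computes per group
assert np.allclose(grn, np.corrcoef(X_sub, rowvar=False))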


print('Reading input data')
perturbation_data = ad.read_h5ad(par["perturbation_data"])
gene_names = perturbation_data.var_names.to_numpy()
tf_all = np.loadtxt(par['tf_all'], dtype=str)
groups = perturbation_data.obs.cell_type

meta = {
"resources_dir":'./'
}
sys.path.append(meta["resources_dir"])
from main import main
layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson']
grn_models = ['scenicplus','celloracle','figr','granie','scglue','collectri']
grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri']
controls = ['negative_control', 'positive_control']

os.makedirs('output', exist_ok=True)
for grn_model in grn_models:
par["score"] = f"output/{grn_model}.csv"
os.makedirs(par['temp_dir'], exist_ok=True)
for grn_model in controls + grn_models:
par["score"] = f"{par['temp_dir']}/reg1-{grn_model}.csv"
for ii, layer in enumerate(layers):
par['prediction'] = f"resources/grn-benchmark/grn_models/{grn_model}.csv"
par["layer"] = layer
if grn_model=='positive_control':
# print('Inferring GRN')
# net = create_positive_control(perturbation_data.layers[par["layer"]], groups)

# net = pd.DataFrame(net, index=gene_names, columns=gene_names)
# net = net.loc[:, net.columns.isin(tf_all)]

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv"
# pivoted_net.to_csv(par['prediction'])
elif grn_model=='negative_control':
# print('Inferring GRN')
# net = create_negative_control(gene_names)

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/negative_control.csv"
# pivoted_net.to_csv(par['prediction'])
else:
par['prediction'] = f"resources/grn_models/{grn_model}.csv"
output = main(par)
output.index = [layer]


if ii == 0:
score = output
else:
score = pd.concat([score,output], axis=0)
score = pd.concat([score, output], axis=0)

print("Write output to file", flush=True)
print(grn_model, layer, score)

print("Write output to file", flush=True)
score.to_csv(par['score'])
score.to_csv(par['score'])

19 changes: 11 additions & 8 deletions src/metrics/regression_2/main.py
@@ -50,7 +50,7 @@ def cross_validate_gene(
n_jobs: int = 10
) -> Dict[str, float]:

results = {'r2': 0.0, 'mse': np.inf, 'avg-r2': 0.0}
results = {'r2': 0.0, 'avg-r2': 0.0}

if n_features == 0:
return results
@@ -89,9 +89,9 @@ def cross_validate_gene(
y_pred = np.concatenate(y_pred, axis=0)
y_target = np.concatenate(y_target, axis=0)

results['r2'] = float(r2_score(y_target, y_pred))
results['mse'] = float(mean_squared_error(y_target, y_pred))
# results['r2'] = float(r2_score(y_target, y_pred))
results['avg-r2'] = float(np.mean(r2s))


return results

@@ -197,8 +197,10 @@ def static_approach(grn: np.ndarray, n_features: np.ndarray, X: np.ndarray, grou

# Cross-validate each gene using the inferred GRN to define select input features
res = cross_validate(reg_type, gene_names, X, groups, grn, n_features, n_jobs=n_jobs)
mean_r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
mean_r2_scores = mean_r2_scores[mean_r2_scores>-1]

return np.mean([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
return np.mean(mean_r2_scores)
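
The filtering added above drops genes whose cross-validated avg-r2 is -1 or lower before averaging, which is presumably the "clipped" behaviour the commit title refers to: a handful of catastrophically mis-predicted genes can no longer drag the regression_2 score arbitrarily low. A toy illustration of the effect:

import numpy as np

scores = np.array([0.3, 0.1, -0.2, -25.0])  # toy per-gene avg-r2 values
print(scores.mean())                        # -6.2, dominated by the outlier gene
print(scores[scores > -1].mean())           # ~0.067, outlier excluded from the mean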


def main(par: Dict[str, Any]) -> pd.DataFrame:
@@ -214,16 +216,17 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
# Load perturbation data
perturbation_data = ad.read_h5ad(par['perturbation_data'])
subsample = par['subsample']
if subsample != -1:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

if True: # one combination of cell_type, sm_name
if subsample == -1:
pass
elif subsample == -2: # one combination of cell_type, sm_name
sampled_obs = perturbation_data.obs.groupby(['sm_name', 'cell_type'], observed=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
obs = perturbation_data.obs
mask = []
for _, row in obs.iterrows():
mask.append((sampled_obs==row).all(axis=1).any())
perturbation_data = perturbation_data[mask,:]
else:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

gene_names = perturbation_data.var.index.to_numpy()
n_genes = len(gene_names)
93 changes: 93 additions & 0 deletions src/metrics/regression_2/script_all.py
@@ -0,0 +1,93 @@
import pandas as pd
import anndata as ad
import sys
import numpy as np
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

par = {
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
'max_workers': 10,
'reg_type': 'ridge',
'subsample': 2,
"tf_all": "./resources/prior/tf_all.csv",
"temp_dir": "output",
'consensus': 'resources/prior/consensus-num-regulators.json',
}

def create_positive_control(X: np.ndarray, groups: np.ndarray):
grns = []
for group in tqdm(np.unique(groups), desc="Processing groups"):
X_sub = X[groups == group, :]
X_sub = StandardScaler().fit_transform(X_sub)
grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
grns.append(grn)
return np.mean(grns, axis=0)
def create_negative_control(gene_names) -> pd.DataFrame:
ratio = [.98, .01, 0.01]
n_tf = 400
net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio)
net = pd.DataFrame(net, index=gene_names)
return net
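
For context, the negative control built here is a sparse random sign matrix over 400 pseudo-TF columns (roughly 1% entries of +1, 1% of -1, 98% zeros). Turning it into the long (target, source, weight) table expected via par['prediction'] mirrors the commented-out melt logic in the loop below; the sketch uses toy gene names and only prints the result, so it is illustrative rather than part of the script:

# Sketch: melt the gene x TF control matrix into non-zero (target, source, weight) edges.
gene_names_toy = ['GATA1', 'TAL1', 'KLF1']   # hypothetical gene symbols
net = create_negative_control(gene_names_toy)
edges = (net.reset_index()
            .melt(id_vars='index', var_name='source', value_name='weight')
            .rename(columns={'index': 'target'}))
edges = edges[edges['weight'] != 0]
print(edges.head())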


print('Reading input data')
perturbation_data = ad.read_h5ad(par["perturbation_data"])
gene_names = perturbation_data.var_names.to_numpy()
tf_all = np.loadtxt(par['tf_all'], dtype=str)
groups = perturbation_data.obs.cell_type

meta = {
"resources_dir":'./'
}
sys.path.append(meta["resources_dir"])
from main import main
layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson']
grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri']
controls = ['negative_control', 'positive_control']

os.makedirs(par['temp_dir'], exist_ok=True)
for grn_model in controls + grn_models:
par["score"] = f"{par['temp_dir']}/reg2-{grn_model}.csv"
for ii, layer in enumerate(layers):
par["layer"] = layer
if grn_model=='positive_control':
# print('Inferring GRN')
# net = create_positive_control(perturbation_data.layers[par["layer"]], groups)

# net = pd.DataFrame(net, index=gene_names, columns=gene_names)
# net = net.loc[:, net.columns.isin(tf_all)]

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv"
# pivoted_net.to_csv(par['prediction'])
elif grn_model=='negative_control':
# print('Inferring GRN')
# net = create_negative_control(gene_names)

# pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')

# pivoted_net = pivoted_net.rename(columns={'index': 'target'})
# pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
par['prediction'] = f"{par['temp_dir']}/negative_control.csv"
# pivoted_net.to_csv(par['prediction'])
else:
par['prediction'] = f"resources/grn_models/{grn_model}.csv"
output = main(par)
output.index = [layer]

if ii == 0:
score = output
else:
score = pd.concat([score, output], axis=0)

print("Write output to file", flush=True)
print(grn_model, layer, score)

print("Write output to file", flush=True)
score.to_csv(par['score'])
