diff --git a/AWSCLIV2.pkg b/AWSCLIV2.pkg new file mode 100644 index 000000000..b3bee6251 Binary files /dev/null and b/AWSCLIV2.pkg differ diff --git a/runs.ipynb b/runs.ipynb index 8483ab88b..e7a9e2acc 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -9,7 +9,28 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting PyYAML\n", + " Downloading PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.1 kB)\n", + "Downloading PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl (171 kB)\n", + "Installing collected packages: PyYAML\n", + "Successfully installed PyYAML-6.0.2\n" + ] + } + ], + "source": [ + "!pip install PyYAML" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -110,9 +131,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 8133 × 22787\n", + " obs: 'cell_type', 'donor_id'\n", + " var: 'gene_ids', 'interval', 'mean', 'std'\n", + " uns: 'log1p'\n", + " layers: 'lognorm'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [] }, { @@ -126,7 +162,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Baseline scores on donor 0. Normalized, non cell type specific" + "### Baseline scores on donor 0. Normalized, non cell type specific, full " ] }, { @@ -357,6 +393,57 @@ "!aws s3 sync resources/grn-benchmark s3://openproblems-data/resources/grn/grn-benchmark --delete" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "delete: resources/grn-benchmark/multiomics_rna_qc.h5ad\n", + "download: s3://openproblems-data/resources/grn/grn-benchmark/multiomics_rna.h5ad to resources/grn-benchmark/multiomics_rna.h5ad\n", + "download: s3://openproblems-data/resources/grn/grn-benchmark/multiomics_rna_0.h5ad to resources/grn-benchmark/multiomics_rna_0.h5ad\n" + ] + } + ], + "source": [ + "!aws s3 sync s3://openproblems-data/resources/grn/grn-benchmark resources/grn-benchmark --delete" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# To be categorized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if par['metacell']:\n", + " print('metacell')\n", + " def create_meta_cells(df, n_cells=15):\n", + " meta_x = []\n", + " for i in range(0, df.shape[0], n_cells):\n", + " meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values)\n", + " df = pd.DataFrame(meta_x, columns=df.columns)\n", + " return df\n", + " \n", + " adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names)\n", + " adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values\n", + " adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values\n", + " df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df))\n", + " X = df.values\n", + " var = pd.DataFrame(index=df.columns)\n", + " obs = df.reset_index()[['cell_type','donor_id']]\n", + " multiomics_rna = ad.AnnData(X=X, obs=obs, var=var)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index fbe58e220..5842a4e6e 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -58,11 +58,8 @@ functionality: default: false description: subset rna seq data to only 7000 hvgs to reduce dimensionality - - - resources: - - path: ../utils/util.py + - path: /src/utils/util.py test_resources: - type: python_script diff --git a/src/control_methods/baseline_corr/config.vsh.yaml b/src/control_methods/baseline_corr/config.vsh.yaml deleted file mode 100644 index 6d96624f3..000000000 --- a/src/control_methods/baseline_corr/config.vsh.yaml +++ /dev/null @@ -1,45 +0,0 @@ -__merge__: ../../api/comp_method.yaml - -functionality: - name: baseline_corr - namespace: control_methods - info: - label: baseline_corr - summary: "Baseline based on correlation" - arguments: - - name: --causal - type: boolean - direction: input - default: false - - name: --corr_method - type: string - required: false - direction: input - default: dotproduct - description: corr method. - - name: --metacell - type: boolean - direction: input - default: false - description: whether to pseudobulk scRNA-seq with metacells - - name: --impute - type: boolean - direction: input - default: false - description: whether to impute scRNA-seq - - resources: - - type: python_script - path: script.py - -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - # packages: [ magic-impute ] - packages: [ ] - - type: native - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/control_methods/baseline_corr/script.py b/src/control_methods/baseline_corr/script.py deleted file mode 100644 index e4d081219..000000000 --- a/src/control_methods/baseline_corr/script.py +++ /dev/null @@ -1,118 +0,0 @@ -import os -import pandas as pd -import numpy as np -import anndata as ad -import scanpy as sc -from tqdm import tqdm -from scipy.stats import spearmanr -from sklearn.preprocessing import StandardScaler - -## VIASH START -par = { - 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad', - 'tf_all': 'resources/prior/tf_all.csv', - 'causal': True, - 'metacell': False, - 'cell_type_specific': False, - 'impute': False, - 'max_n_links': 50000, - 'corr_method': 'pearson', - 'prediction': 'resources/grn_models/donor_0_default/pearson_causal.csv', - "seed": 32 -} -## VIASH END -print(par) - -def process_links(net, par): - net = net[net.source!=net.target] - net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) - net = net_sorted.head(par['max_n_links']).reset_index(drop=True) - return net - -def corr_net(X, gene_names, par): - if par['corr_method'] == "pearson": - X = StandardScaler().fit_transform(X) - net = np.dot(X.T, X) / X.shape[0] - elif par['corr_method'] == "spearman": - net = spearmanr(X).statistic - net = pd.DataFrame(net, index=gene_names, columns=gene_names) - - if par['causal']: - net = net[tf_all] - else: - net = net.sample(len(tf_all), axis=1, random_state=par['seed']) - net = net.reset_index() - index_name = net.columns[0] - net = net.melt(id_vars=index_name, var_name='source', value_name='weight') - - net.rename(columns={index_name: 'target'}, inplace=True) - net = process_links(net, par) - - return net - -def create_corr_net(X, gene_names, groups, par): - # if par['cell_type_specific']: - if True: - i = 0 - for group in tqdm(np.unique(groups), desc="Processing groups"): - X_sub = X[groups == group, :] - net = corr_net(X_sub, gene_names, par) - net['cell_type'] = group - if i==0: - grn = net - else: - grn = pd.concat([grn, net], axis=0).reset_index(drop=True) - i += 1 - else: - grn = corr_net(X, gene_names, par) - # grn.drop(columns=['cell_type'], inplace=True) - # grn = grn.groupby(['source', 'target']).mean().reset_index() - # grn = process_links(grn, par) - return grn -print('Read data') -multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) -# multiomics_rna = multiomics_rna[:,:5000] #TODO: togo - -if par['metacell']: - print('metacell') - def create_meta_cells(df, n_cells=15): - meta_x = [] - for i in range(0, df.shape[0], n_cells): - meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values) - df = pd.DataFrame(meta_x, columns=df.columns) - return df - - adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names) - adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values - adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values - df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df)) - X = df.values - var = pd.DataFrame(index=df.columns) - obs = df.reset_index()[['cell_type','donor_id']] - multiomics_rna = ad.AnnData(X=X, obs=obs, var=var) - -gene_names = multiomics_rna.var_names.to_numpy() -tf_all = np.loadtxt(par['tf_all'], dtype=str) -groups = multiomics_rna.obs.cell_type -tf_all = np.intersect1d(tf_all, gene_names) - -print('Noramlize data') -sc.pp.normalize_total(multiomics_rna) -sc.pp.log1p(multiomics_rna) -sc.pp.scale(multiomics_rna) - -if par['impute']: - print("imputing") - import magic - import scprep - - magic_operator = magic.MAGIC() - - multiomics_rna = magic_operator.fit_transform(multiomics_rna) - - print('zero ration: ', (multiomics_rna.X==0).sum()/multiomics_rna.X.size) -print('Create corr net') -net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par) - -print('Output GRN') -net.to_csv(par['prediction']) diff --git a/src/control_methods/baseline_corr/test.sh b/src/control_methods/baseline_corr/test.sh deleted file mode 100644 index 884e70266..000000000 --- a/src/control_methods/baseline_corr/test.sh +++ /dev/null @@ -1,5 +0,0 @@ -viash run src/control_methods/baseline_corr/config.vsh.yaml -- \ - --prediction output/baseline_corr.csv \ - --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ - --causal true diff --git a/src/control_methods/pearson/script.py b/src/control_methods/pearson/script.py index f2396c957..29a54cff7 100644 --- a/src/control_methods/pearson/script.py +++ b/src/control_methods/pearson/script.py @@ -1,18 +1,8 @@ -import os -import pandas as pd -import numpy as np -import anndata as ad -import scanpy as sc -from tqdm import tqdm -from scipy.stats import spearmanr -from sklearn.preprocessing import StandardScaler -from utils.util import process_data, create_corr_net ## VIASH START par = { 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad', 'tf_all': 'resources/prior/tf_all.csv', - 'causal': False, 'cell_type_specific': True, 'max_n_links': 50000, 'prediction': 'resources/grn_models/donor_0_default/pearson.csv', @@ -22,20 +12,11 @@ } ## VIASH END print(par) +import sys +sys.path.append('./src/utils') +from util import create_corr_net -print('Read data') -multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) -process_data(multiomics_rna) - -gene_names = multiomics_rna.var_names.to_numpy() -tf_all = np.loadtxt(par['tf_all'], dtype=str) -groups = multiomics_rna.obs.cell_type -tf_all = np.intersect1d(tf_all, gene_names) -n_tfs = len(tf_all) - - -print('Create corr net') -net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par, tf_all=None) - +par['causal'] = False +net = create_corr_net(par) print('Output GRN') net.to_csv(par['prediction']) diff --git a/src/control_methods/pearson/test.sh b/src/control_methods/pearson/test.sh index 884e70266..14cf51c2c 100644 --- a/src/control_methods/pearson/test.sh +++ b/src/control_methods/pearson/test.sh @@ -1,5 +1,4 @@ -viash run src/control_methods/baseline_corr/config.vsh.yaml -- \ +viash run src/control_methods/pearson/config.vsh.yaml -- \ --prediction output/baseline_corr.csv \ --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ - --causal true + --tf_all resources/prior/tf_all.csv \ No newline at end of file diff --git a/src/control_methods/pearson_causal/script.py b/src/control_methods/pearson_causal/script.py index 36860fa09..ad913af35 100644 --- a/src/control_methods/pearson_causal/script.py +++ b/src/control_methods/pearson_causal/script.py @@ -1,78 +1,21 @@ -import os -import pandas as pd -import numpy as np -import anndata as ad -import scanpy as sc -from tqdm import tqdm -from scipy.stats import spearmanr -from sklearn.preprocessing import StandardScaler ## VIASH START par = { 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad', 'tf_all': 'resources/prior/tf_all.csv', - 'causal': True, 'cell_type_specific': True, 'max_n_links': 50000, 'prediction': 'resources/grn_models/donor_0_default/pearson_causal.csv', "seed": 32 } ## VIASH END -print(par) +import sys +sys.path.append('./src/utils') +from util import create_corr_net -def process_links(net, par): - net = net[net.source!=net.target] - net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) - net = net_sorted.head(par['max_n_links']).reset_index(drop=True) - return net - -def corr_net(X, gene_names, par): - X = StandardScaler().fit_transform(X) - net = np.dot(X.T, X) / X.shape[0] - net = pd.DataFrame(net, index=gene_names, columns=gene_names) - - net = net[tf_all] - net = net.reset_index() - index_name = net.columns[0] - net = net.melt(id_vars=index_name, var_name='source', value_name='weight') - - net.rename(columns={index_name: 'target'}, inplace=True) - net = process_links(net, par) - - return net - -def create_corr_net(X, gene_names, groups, par): - if par['cell_type_specific']: - i = 0 - for group in tqdm(np.unique(groups), desc="Processing groups"): - X_sub = X[groups == group, :] - net = corr_net(X_sub, gene_names, par) - net['cell_type'] = group - if i==0: - grn = net - else: - grn = pd.concat([grn, net], axis=0).reset_index(drop=True) - i += 1 - else: - grn = corr_net(X, gene_names, par) - return grn -print('Read data') -multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) - - -gene_names = multiomics_rna.var_names.to_numpy() -tf_all = np.loadtxt(par['tf_all'], dtype=str) -groups = multiomics_rna.obs.cell_type -tf_all = np.intersect1d(tf_all, gene_names) - -if par['normalize']: - print('Noramlize data') - sc.pp.normalize_total(multiomics_rna) - sc.pp.log1p(multiomics_rna) - sc.pp.scale(multiomics_rna) - -print('Create corr net') -net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par) +print('Create causal corr net') +par['causal'] = True +net = create_corr_net(par) print('Output GRN') net.to_csv(par['prediction']) diff --git a/src/control_methods/positive_control/script.py b/src/control_methods/positive_control/script.py index 46732a323..1e94e605a 100644 --- a/src/control_methods/positive_control/script.py +++ b/src/control_methods/positive_control/script.py @@ -15,65 +15,19 @@ 'cell_type_specific': True, 'max_n_links': 50000, 'prediction': 'resources/grn_models/positive_control.csv', - "seed": 32 -} + "seed": 32, + 'normalize': False} ## VIASH END -print(par) +import sys +sys.path.append('./src/utils') +from util import create_corr_net -def process_links(net, par): - net = net[net.source!=net.target] - net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) - net = net_sorted.head(par['max_n_links']).reset_index(drop=True) - return net +print('Create causal corr net') +par['causal'] = True +par['multiomics_rna'] = par['perturbation_data'] +par['only_hvgs'] = False -def corr_net(X, gene_names, par): - X = StandardScaler().fit_transform(X) - net = np.dot(X.T, X) / X.shape[0] - net = pd.DataFrame(net, index=gene_names, columns=gene_names) - - net = net[tf_all] - net = net.reset_index() - index_name = net.columns[0] - net = net.melt(id_vars=index_name, var_name='source', value_name='weight') - - net.rename(columns={index_name: 'target'}, inplace=True) - net = process_links(net, par) - - return net - -def create_corr_net(X, gene_names, groups, par): - if par['cell_type_specific']: - i = 0 - for group in tqdm(np.unique(groups), desc="Processing groups"): - X_sub = X[groups == group, :] - net = corr_net(X_sub, gene_names, par) - net['cell_type'] = group - if i==0: - grn = net - else: - grn = pd.concat([grn, net], axis=0).reset_index(drop=True) - i += 1 - else: - grn = corr_net(X, gene_names, par) - return grn -print('Read data') -multiomics_rna = ad.read_h5ad(par["perturbation_data"]) - - -if par['normalize']: - print('Noramlize data') - sc.pp.normalize_total(multiomics_rna) - sc.pp.log1p(multiomics_rna) - sc.pp.scale(multiomics_rna) - -gene_names = multiomics_rna.var_names.to_numpy() -tf_all = np.loadtxt(par['tf_all'], dtype=str) -groups = multiomics_rna.obs.cell_type -tf_all = np.intersect1d(tf_all, gene_names) - - -print('Create corr net') -net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par) +net = create_corr_net(par) print('Output GRN') net.to_csv(par['prediction']) diff --git a/src/utils/util.py b/src/utils/util.py index bac97c72e..0b78914e3 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -3,6 +3,10 @@ import numpy as np from tqdm import tqdm import scanpy as sc +from sklearn.preprocessing import StandardScaler +import scipy.sparse as sp + + def process_links(net, par): net = net[net.source!=net.target] @@ -10,12 +14,12 @@ def process_links(net, par): net = net_sorted.head(par['max_n_links']).reset_index(drop=True) return net -def corr_net(X, gene_names, par, tf_all=None): +def corr_net(X, gene_names, par, tf_all, causal=False): X = StandardScaler().fit_transform(X) net = np.dot(X.T, X) / X.shape[0] net = pd.DataFrame(net, index=gene_names, columns=gene_names) - if tf_all is None: - net = net.sample(n_tfs, axis=1, random_state=par['seed']) + if causal: + net = net.sample(len(tf_all), axis=1, random_state=par['seed']) else: net = net[tf_all] net = net.reset_index() @@ -27,12 +31,39 @@ def corr_net(X, gene_names, par, tf_all=None): return net -def create_corr_net(X, gene_names, groups, par, tf_all): +def process_data(adata, par): + if par['normalize']: + print('Noramlize data') + sc.pp.normalize_total(adata) + sc.pp.log1p(adata) + sc.pp.scale(adata) + if sp.issparse(adata.X): + # Convert to dense if it's sparse + adata.X = adata.X.toarray() # You can also use .todense(), but .toarray() gives a NumPy array directly + else: + print("adata.X is already dense.") + if par['only_hvgs']: + print('Subsetting data to hvgs') + adata = adata[:, adata.var.hvg_counts] + print('New dimension of data: ', adata.shape) + +def create_corr_net(par): + print(par) + + print('Read data') + multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) + process_data(multiomics_rna, par) + gene_names = multiomics_rna.var_names.to_numpy() + tf_all = np.loadtxt(par['tf_all'], dtype=str) + groups = multiomics_rna.obs.cell_type + tf_all = np.intersect1d(tf_all, gene_names) + + X = multiomics_rna.X if par['cell_type_specific']: i = 0 for group in tqdm(np.unique(groups), desc="Processing groups"): X_sub = X[groups == group, :] - net = corr_net(X_sub, gene_names, par) + net = corr_net(X_sub, gene_names, par, tf_all, par['causal']) net['cell_type'] = group if i==0: grn = net @@ -40,16 +71,5 @@ def create_corr_net(X, gene_names, groups, par, tf_all): grn = pd.concat([grn, net], axis=0).reset_index(drop=True) i += 1 else: - grn = corr_net(X, gene_names, par) + grn = corr_net(X, gene_names, par, tf_all, par['causal']) return grn -def process_data(adata, par): - if par['normalize']: - print('Noramlize data') - sc.pp.normalize_total(adata) - sc.pp.log1p(adata) - sc.pp.scale(adata) - adata.X = adata.X.todense() - if par['only_hvgs']: - print('Subsetting data to hvgs') - adata = adata[:, adata.var.hvg_counts] - print('New dimension of data: ', adata.shape) \ No newline at end of file