diff --git a/runs.ipynb b/runs.ipynb index a7597aa6e..cd337f2bc 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -1,5 +1,10 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -105,30 +110,136 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Pending runs " + "# Marco data " ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 8133 × 22787\n", - " obs: 'cell_type', 'donor_id'\n", - " var: 'gene_ids', 'interval', 'mean', 'std'\n", - " uns: 'log1p'\n", - " layers: 'lognorm'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "__MACOSX\t\t\t\t shalek.h5ad\n", + "han.h5ad\t\t\t\t shalek_KDunion.csv\n", + "han_KDunion.csv\t\t\t\t shalek_chipunion.csv\n", + "han_chipunion.csv\t\t\t shalek_chipunion_KDUnion_intersect.csv\n", + "han_chipunion_KDUnion_intersect.csv\t zhao.h5ad\n", + "jackson.h5ad\t\t\t\t zhao_KDunion.csv\n", + "jackson_KDunion.csv\t\t\t zhao_chipunion.csv\n", + "jackson_chipunion.csv\t\t\t zhao_chipunion_KDUnion_intersect.csv\n", + "jackson_chipunion_KDUnion_intersect.csv\n" + ] } ], - "source": [] + "source": [ + "!ls resources_local/mccalla_extended/" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "zhao-KDunion. adata shape: (36199, 8442), GT size: (105136, 3), Gene overlap: (8425,)\n", + "/viash_automount/tmp/viash-run-regression_1-8RZIHd.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.020129 0.03817 0.009021\n", + "(3,) (3,)\n", + "zhao-chipunion. adata shape: (36199, 8442), GT size: (60662, 3), Gene overlap: (8378,)\n", + "/viash_automount/tmp/viash-run-regression_1-rLuWXt.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.006815 0.064118 0.028652\n", + "(3,) (3,)\n", + "zhao-chipunion_KDUnion_intersect. adata shape: (36199, 8442), GT size: (9019, 3), Gene overlap: (4493,)\n", + "/viash_automount/tmp/viash-run-regression_1-nsE1Jg.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.004249 0.000488 -0.00188\n", + "(3,) (3,)\n", + "shalek-KDunion. adata shape: (1211, 9411), GT size: (148047, 3), Gene overlap: (8784,)\n", + "/viash_automount/tmp/viash-run-regression_1-54532G.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.010243 0.104809 0.047283\n", + "(3,) (3,)\n", + "shalek-chipunion. adata shape: (1211, 9411), GT size: (96383, 3), Gene overlap: (8670,)\n", + "/viash_automount/tmp/viash-run-regression_1-9tO5h1.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.003727 0.042922 0.019598\n", + "(3,) (3,)\n", + "shalek-chipunion_KDUnion_intersect. adata shape: (1211, 9411), GT size: (67705, 3), Gene overlap: (7852,)\n", + "/viash_automount/tmp/viash-run-regression_1-5SN7ZR.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.002252 0.094459 0.046103\n", + "(3,) (3,)\n", + "han-KDunion. adata shape: (5520, 7465), GT size: (77400, 3), Gene overlap: (7387,)\n", + "/viash_automount/tmp/viash-run-regression_1-eSKe1T.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.019109 0.031554 0.006222\n", + "(3,) (3,)\n", + "han-chipunion. adata shape: (5520, 7465), GT size: (160038, 3), Gene overlap: (7458,)\n", + "/viash_automount/tmp/viash-run-regression_1-VnCHsA.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.013187 0.071785 0.029299\n", + "(3,) (3,)\n", + "han-chipunion_KDUnion_intersect. adata shape: (5520, 7465), GT size: (8463, 3), Gene overlap: (4141,)\n", + "/viash_automount/tmp/viash-run-regression_1-0OpbTu.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.004217 0.004052 -0.000082\n", + "(3,) (3,)\n", + "jackson-KDunion. adata shape: (17396, 5736), GT size: (27433, 3), Gene overlap: (4785,)\n", + "/viash_automount/tmp/viash-run-regression_1-wvATsv.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.078523 0.238183 0.07983\n", + "(3,) (3,)\n", + "jackson-chipunion. adata shape: (17396, 5736), GT size: (24481, 3), Gene overlap: (4898,)\n", + "/viash_automount/tmp/viash-run-regression_1-GudAWm.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 -0.027772 0.186608 0.079418\n", + "(3,) (3,)\n", + "jackson-chipunion_KDUnion_intersect. adata shape: (17396, 5736), GT size: (2661, 3), Gene overlap: (1515,)\n", + "/viash_automount/tmp/viash-run-regression_1-lrY1xV.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", + " output = ad.AnnData(\n", + " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", + "0 0.004694 0.321477 0.163086\n", + "(3,) (3,)\n" + ] + } + ], + "source": [ + "import subprocess\n", + "import anndata as ad \n", + "import pandas as pd\n", + "import numpy as np\n", + "for cell_type in ['zhao', 'shalek', 'han', 'jackson']:\n", + " adata = ad.read_h5ad(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n", + " adata.layers['norm'] = adata.X\n", + " adata.obs['cell_type'] = 'onecelltype'\n", + " adata.write(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n", + " subsample = min([10000, len(adata)])\n", + " for GT in ['KDunion', 'chipunion', 'chipunion_KDUnion_intersect']:\n", + " GT_df = pd.read_csv(f'resources_local/mccalla_extended/{cell_type}_{GT}.csv')\n", + " gene_overlap = np.intersect1d(adata.var_names, GT_df.target.unique()).shape\n", + " print(f\"{cell_type}-{GT}. adata shape: {adata.shape}, GT size: {GT_df.shape}, Gene overlap: {gene_overlap}\")\n", + " command = f\"viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources_local/mccalla_extended/{cell_type}.h5ad --prediction resources_local/mccalla_extended/{cell_type}_{GT}.csv --layer norm --subsample {subsample} --apply_tf false --tf_all resources/prior/tf_all.csv --max_n_links -1 --verbose 1 --score output/{cell_type}_{GT}.h5ad\"\n", + " subprocess.run(command, shell=True, check=True)" + ] }, { "cell_type": "markdown", @@ -346,7 +457,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# To be categorized" + "# Metacell" ] }, { diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 8d6e6b453..cbc7f7d65 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -62,6 +62,10 @@ functionality: - name: --max_n_links type: integer default: 50000 + - name: --verbose + type: integer + default: 2 + direction: input diff --git a/src/metrics/regression_1/config.vsh.yaml b/src/metrics/regression_1/config.vsh.yaml index 25729eb86..7a66c3aab 100644 --- a/src/metrics/regression_1/config.vsh.yaml +++ b/src/metrics/regression_1/config.vsh.yaml @@ -18,6 +18,8 @@ functionality: - type: python_script path: script.py - path: main.py + - path: /src/utils/util.py + dest: util.py platforms: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index f2477c04e..c9c2323d3 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -9,16 +9,10 @@ import pandas as pd import anndata as ad from tqdm import tqdm - +from util import verbose_print, process_links, verbose_tqdm import os -def select_top_links(net, par): - print("Number of links reduced to ", par['max_n_links']) - net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) - net = net_sorted.head(par['max_n_links']).reset_index(drop=True) - return net - class lightgbm_wrapper: def __init__(self, params, max_workers=None): self.params = params @@ -117,10 +111,9 @@ def cross_validation(net, prturb_adata, par:dict): feature_fraction=0.05, verbosity=-1) regr = lightgbm_wrapper(params, max_workers) else: - print(f'{reg_type} is not defined') - raise ValueError("define first") + raise ValueError(f"{reg_type} is not defined.") - for group in tqdm(unique_groups, desc="Processing groups"): + for group in verbose_tqdm(unique_groups, "Processing groups", 2, par['verbose']): mask_va = groups == group mask_tr = ~mask_va X_tr = X[mask_tr & mask_shared_genes, :] @@ -144,7 +137,7 @@ def regression_1( cell_types = prturb_adata.obs.cell_type.unique() score_list = [] for cell_type in cell_types: - print(f'----cross validate for {cell_type}----') + verbose_print(par['verbose'], f'----cross validate for {cell_type}----', 2) # check if net is cell type specific if 'cell_type' in net.columns: if cell_type not in net.cell_type.unique(): @@ -152,8 +145,9 @@ def regression_1( net_sub = net[net.cell_type==cell_type] else: net_sub = net - if len(net_sub)>par['max_n_links']: - net_sub = select_top_links(net_sub, par) #only top links are considered + if (len(net_sub)>par['max_n_links']) and (par['max_n_links']!=-1): + net_sub = process_links(net_sub, par) #only top links are considered + verbose_print(par['verbose'], f"Number of links reduced to {par['max_n_links']}", 2) prturb_adata_sub = prturb_adata[prturb_adata.obs.cell_type==cell_type,:] y_true_sub, y_pred_sub = cross_validation(net_sub, prturb_adata_sub, par) @@ -201,7 +195,7 @@ def main(par): manipulate = None ## read and process input data - print('Reading input files', flush=True) + verbose_print(par['verbose'], 'Reading input files', 3) perturbation_data = ad.read_h5ad(par['perturbation_data']) tf_all = np.loadtxt(par['tf_all'], dtype=str) @@ -236,14 +230,13 @@ def main(par): perturbation_data = perturbation_data[mask,:] else: perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :] - - print(perturbation_data.shape) - + verbose_print(par['verbose'], perturbation_data.shape,4) perturbation_data.X = perturbation_data.layers[layer] - print(f'Compute metrics for layer: {layer}', flush=True) + verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3) + tfs_cases = [-1] if par['min_tf']: tfs_cases += 30 @@ -253,7 +246,7 @@ def main(par): par['exclude_missing_genes'] = exclude_missing_genes par['tf_n'] = tf_n run_key = f'ex({exclude_missing_genes})_tf({tf_n})' - print(run_key) + verbose_print(par['verbose'], f'Run for metric: {run_key}', 3) output = regression_1(net, perturbation_data, par=par, max_workers=max_workers) diff --git a/src/utils/util.py b/src/utils/util.py index e3e967e16..d8c395edc 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -5,6 +5,14 @@ from sklearn.preprocessing import StandardScaler import scipy.sparse as sp +def verbose_print(verbose_level, message, level): + if level <= verbose_level: + print(message) +def verbose_tqdm(iterable, desc, level, verbose_level): + if level <= verbose_level: + return tqdm(iterable, desc=desc) + else: + return iterable # Return the iterable without a progress bar def process_links(net, par):