From 916d854b0df3629c86769b495c90dcfb528e7559 Mon Sep 17 00:00:00 2001
From: Jalil Nourisa
Date: Sat, 21 Sep 2024 12:09:27 +0200
Subject: [PATCH] Overall score is added to runs

---
 runs.ipynb                                  | 907 ++++++++++++++------
 scripts/run_benchmark_all.sh                |  11 +-
 src/control_methods/pearson/script.py       |   1 -
 src/utils/util.py                           |   4 -
 src/workflows/run_benchmark/config.vsh.yaml |   5 +-
 src/workflows/run_benchmark/main.nf         |   1 -
 6 files changed, 668 insertions(+), 261 deletions(-)

diff --git a/runs.ipynb b/runs.ipynb
index 26833540d..5ade52ba0 100644
--- a/runs.ipynb
+++ b/runs.ipynb
@@ -25,11 +25,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 53,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "upload: resources/grn_models/default/negative_control.csv to s3://openproblems-data/resources/grn/grn_models/default/negative_control.csv\n",
+      "upload: resources/grn_models/d0_hvgs/negative_control.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/negative_control.csv\n",
+      "upload: resources/results/d0_hvgs/trace.txt to s3://openproblems-data/resources/grn/results/d0_hvgs/trace.txt\n",
+      "delete: s3://openproblems-data/resources/grn/results/d0_hvgs_baseline/trace.txt\n"
+     ]
+    }
+   ],
    "source": [
-    "\n",
     "!aws s3 sync resources/grn-benchmark s3://openproblems-data/resources/grn/grn-benchmark --delete\n",
     "!aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n",
     "!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n",
@@ -135,105 +145,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 54,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "zhao-KDunion. adata shape: (36199, 8442), GT size: (105136, 3), Gene overlap: (8425,)\n",
-      "/viash_automount/tmp/viash-run-regression_1-8RZIHd.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
-      "  output = ad.AnnData(\n",
-      " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
-      "0 -0.020129 0.03817 0.009021\n",
-      "(3,) (3,)\n",
-      "zhao-chipunion. adata shape: (36199, 8442), GT size: (60662, 3), Gene overlap: (8378,)\n",
-      "/viash_automount/tmp/viash-run-regression_1-rLuWXt.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
-      "  output = ad.AnnData(\n",
-      " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
-      "0 -0.006815 0.064118 0.028652\n",
-      "(3,) (3,)\n",
-      "zhao-chipunion_KDUnion_intersect. adata shape: (36199, 8442), GT size: (9019, 3), Gene overlap: (4493,)\n",
-      "/viash_automount/tmp/viash-run-regression_1-nsE1Jg.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
-      "  output = ad.AnnData(\n",
-      " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
-      "0 -0.004249 0.000488 -0.00188\n",
-      "(3,) (3,)\n",
-      "shalek-KDunion. 
adata shape: (1211, 9411), GT size: (148047, 3), Gene overlap: (8784,)\n", - "/viash_automount/tmp/viash-run-regression_1-54532G.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.010243 0.104809 0.047283\n", - "(3,) (3,)\n", - "shalek-chipunion. adata shape: (1211, 9411), GT size: (96383, 3), Gene overlap: (8670,)\n", - "/viash_automount/tmp/viash-run-regression_1-9tO5h1.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.003727 0.042922 0.019598\n", - "(3,) (3,)\n", - "shalek-chipunion_KDUnion_intersect. adata shape: (1211, 9411), GT size: (67705, 3), Gene overlap: (7852,)\n", - "/viash_automount/tmp/viash-run-regression_1-5SN7ZR.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.002252 0.094459 0.046103\n", - "(3,) (3,)\n", - "han-KDunion. adata shape: (5520, 7465), GT size: (77400, 3), Gene overlap: (7387,)\n", - "/viash_automount/tmp/viash-run-regression_1-eSKe1T.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.019109 0.031554 0.006222\n", - "(3,) (3,)\n", - "han-chipunion. adata shape: (5520, 7465), GT size: (160038, 3), Gene overlap: (7458,)\n", - "/viash_automount/tmp/viash-run-regression_1-VnCHsA.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.013187 0.071785 0.029299\n", - "(3,) (3,)\n", - "han-chipunion_KDUnion_intersect. adata shape: (5520, 7465), GT size: (8463, 3), Gene overlap: (4141,)\n", - "/viash_automount/tmp/viash-run-regression_1-0OpbTu.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.004217 0.004052 -0.000082\n", - "(3,) (3,)\n", - "jackson-KDunion. adata shape: (17396, 5736), GT size: (27433, 3), Gene overlap: (4785,)\n", - "/viash_automount/tmp/viash-run-regression_1-wvATsv.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. 
Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.078523 0.238183 0.07983\n", - "(3,) (3,)\n", - "jackson-chipunion. adata shape: (17396, 5736), GT size: (24481, 3), Gene overlap: (4898,)\n", - "/viash_automount/tmp/viash-run-regression_1-GudAWm.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 -0.027772 0.186608 0.079418\n", - "(3,) (3,)\n", - "jackson-chipunion_KDUnion_intersect. adata shape: (17396, 5736), GT size: (2661, 3), Gene overlap: (1515,)\n", - "/viash_automount/tmp/viash-run-regression_1-lrY1xV.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n", - " output = ad.AnnData(\n", - " ex(False)_tf(-1) ex(True)_tf(-1) Mean\n", - "0 0.004694 0.321477 0.163086\n", - "(3,) (3,)\n" - ] - } - ], + "outputs": [], "source": [ - "import subprocess\n", - "import anndata as ad \n", - "import pandas as pd\n", - "import numpy as np\n", - "for cell_type in ['zhao', 'shalek', 'han', 'jackson']:\n", - " adata = ad.read_h5ad(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n", - " adata.layers['norm'] = adata.X\n", - " adata.obs['cell_type'] = 'onecelltype'\n", - " adata.write(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n", - " subsample = min([10000, len(adata)])\n", - " for GT in ['KDunion', 'chipunion', 'chipunion_KDUnion_intersect']:\n", - " GT_df = pd.read_csv(f'resources_local/mccalla_extended/{cell_type}_{GT}.csv')\n", - " gene_overlap = np.intersect1d(adata.var_names, GT_df.target.unique()).shape\n", - " print(f\"{cell_type}-{GT}. adata shape: {adata.shape}, GT size: {GT_df.shape}, Gene overlap: {gene_overlap}\")\n", - " command = f\"viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources_local/mccalla_extended/{cell_type}.h5ad --prediction resources_local/mccalla_extended/{cell_type}_{GT}.csv --layer norm --subsample {subsample} --apply_tf false --tf_all resources/prior/tf_all.csv --max_n_links -1 --verbose 1 --score output/{cell_type}_{GT}.h5ad\"\n", - " subprocess.run(command, shell=True, check=True)" + "# import subprocess\n", + "# import anndata as ad \n", + "# import pandas as pd\n", + "# import numpy as np\n", + "# for cell_type in ['zhao', 'shalek', 'han', 'jackson']:\n", + "# adata = ad.read_h5ad(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n", + "# adata.layers['norm'] = adata.X\n", + "# adata.obs['cell_type'] = 'onecelltype'\n", + "# adata.write(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n", + "# subsample = min([10000, len(adata)])\n", + "# for GT in ['KDunion', 'chipunion', 'chipunion_KDUnion_intersect']:\n", + "# GT_df = pd.read_csv(f'resources_local/mccalla_extended/{cell_type}_{GT}.csv')\n", + "# gene_overlap = np.intersect1d(adata.var_names, GT_df.target.unique()).shape\n", + "# print(f\"{cell_type}-{GT}. 
adata shape: {adata.shape}, GT size: {GT_df.shape}, Gene overlap: {gene_overlap}\")\n", + "# command = f\"viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources_local/mccalla_extended/{cell_type}.h5ad --prediction resources_local/mccalla_extended/{cell_type}_{GT}.csv --layer norm --subsample {subsample} --apply_tf false --tf_all resources/prior/tf_all.csv --max_n_links -1 --verbose 1 --score output/{cell_type}_{GT}.h5ad\"\n", + "# subprocess.run(command, shell=True, check=True)" ] }, { @@ -243,297 +174,312 @@ "# d0_hvgs" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scores" + ] + }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "methods = [ 'pearson_corr', 'pearson_causal', 'positive_control', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" + "methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 33, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "download: s3://openproblems-data/resources/grn/results/d0_hvgs/scores.yaml to resources/results/d0_hvgs/scores.yaml\n", - "download: s3://openproblems-data/resources/grn/results/d0_hvgs/trace.txt to resources/results/d0_hvgs/trace.txt\n", - "download: s3://openproblems-data/resources/grn/results/d0_hvgs/metric_configs.yaml to resources/results/d0_hvgs/metric_configs.yaml\n", - "download: s3://openproblems-data/resources/grn/results/d0_hvgs/state.yaml to resources/results/d0_hvgs/state.yaml\n" - ] - }, { "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5
pearson_corr0.2386640.5146120.5295020.524232collectrinannannannan
negative_controlnannannannan
pearson_causal0.3552560.5787530.7413280.560490positive_control0.4891470.6771550.6554070.574608
positive_control0.4891470.6771550.6554070.574608pearson_corr0.2386640.5146120.5295020.524232
portia0.1489410.2272480.4512560.518048pearson_causal0.3552560.5787530.7413280.560490
ppcor0.0228460.0941070.3966800.509874portia0.1489410.2272480.4512560.518048
genie30.3726410.4903570.7540730.576580ppcor0.0228460.0941070.3966800.509874
grnboost20.3810320.4598600.7818520.609075genie30.3726410.4903570.7540730.576580
scenic0.1543990.2065710.6008390.574294grnboost20.3810320.4598600.7818520.609075
scglue0.0783090.2388590.4486170.527076scenic0.1543990.2065710.6008390.574294
celloracle0.2168970.3114510.6395560.580147scglue0.0783090.2388590.4486170.527076
celloracle0.2168970.3114510.6395560.580147
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 23, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "RUN_ID=\"d0_hvgs\" \n", - "df_all = process_data(RUN_ID, models_all=methods)\n", - "df_all.style.background_gradient()" + "df_scores = process_data(RUN_ID, models_all=methods)\n", + "df_scores.style.background_gradient()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Format resourcs used" + "## Format resourcs used" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. 
Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n" ] }, @@ -650,7 +596,7 @@ "scglue 35.933720 " ] }, - "execution_count": 45, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -702,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -728,6 +674,7 @@ " \n", " %cpu\n", " peak_rss\n", + " peak_vmem\n", " rchar\n", " wchar\n", " duration\n", @@ -738,6 +685,7 @@ " celloracle\n", " 799.6\n", " 14.9\n", + " 35.4\n", " 18.0\n", " 86.1\n", " 1.472222\n", @@ -747,17 +695,16 @@ "" ], "text/plain": [ - " %cpu peak_rss rchar wchar duration\n", - "celloracle 799.6 14.9 18.0 86.1 1.472222" + " %cpu peak_rss peak_vmem rchar wchar duration\n", + "celloracle 799.6 14.9 35.4 18.0 86.1 1.472222" ] }, - "execution_count": 47, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\n", "# sequra runs\n", "base_dir = 'resources/results/d0_hvgs_res/'\n", "models = ['celloracle']\n", @@ -780,11 +727,12 @@ "\n", "\n", "for i, method in enumerate(models):\n", - " df = pd.read_csv(f'{base_dir}/{method}.txt', sep='\\t')[cols]\n", + " df = pd.read_csv(f'{base_dir}/{method}.txt', sep='\\t')\n", + " df = df[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n", " df = df.drop(1)\n", "\n", "\n", - " for col in cols:\n", + " for col in df.columns:\n", " if col=='%cpu':\n", " df[col] = df[col].str.replace('%', '').astype(float)\n", " elif col=='duration':\n", @@ -803,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -827,8 +775,8 @@ " \n", " \n", " \n", - " Peak memory\n", - " Duration\n", + " Peak memory (GB)\n", + " Duration (hour)\n", " \n", " \n", " \n", @@ -872,17 +820,17 @@ "" ], "text/plain": [ - " Peak memory Duration\n", - "celloracle 14.900000 1.472222\n", - "portia 46.943497 0.110556\n", - "grnboost2 3.067471 1.568056\n", - "scenic 30.356461 1.908056\n", - "genie3 13.105103 16.682500\n", - "ppcor 3.909119 0.556667\n", - "scglue 29.917423 4.380278" + " Peak memory (GB) Duration (hour)\n", + "celloracle 14.900000 1.472222\n", + "portia 46.943497 0.110556\n", + "grnboost2 3.067471 1.568056\n", + "scenic 30.356461 1.908056\n", + "genie3 13.105103 16.682500\n", + "ppcor 3.909119 0.556667\n", + "scglue 29.917423 4.380278" ] }, - "execution_count": 48, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -904,6 +852,479 @@ "df_res" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Merge scores with resources" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5overall_score
collectri0.0000000.0000000.0000000.0000000.000000
negative_control0.0000000.0000000.0000000.0000000.000000
positive_control1.0000001.0000000.8382740.9434100.945421
pearson_corr0.4879190.7599620.6772410.8607020.696456
pearson_causal0.7262770.8546830.9481690.9202310.862340
portia0.3044910.3355920.5771630.8505490.516949
ppcor0.0467050.1389740.5073590.8371290.382542
genie30.7618180.7241420.9644700.9466490.849270
grnboost20.7789740.6791051.0000001.0000000.864520
scenic0.3156500.3050560.7684810.9428950.583021
scglue0.1600930.3527380.5737870.8653710.487997
celloracle0.4434200.4599410.8180010.9525050.668467
\n", + "
" + ], + "text/plain": [ + " ex(False)_tf(-1) ex(True)_tf(-1) static-theta-0.0 \\\n", + "collectri 0.000000 0.000000 0.000000 \n", + "negative_control 0.000000 0.000000 0.000000 \n", + "positive_control 1.000000 1.000000 0.838274 \n", + "pearson_corr 0.487919 0.759962 0.677241 \n", + "pearson_causal 0.726277 0.854683 0.948169 \n", + "portia 0.304491 0.335592 0.577163 \n", + "ppcor 0.046705 0.138974 0.507359 \n", + "genie3 0.761818 0.724142 0.964470 \n", + "grnboost2 0.778974 0.679105 1.000000 \n", + "scenic 0.315650 0.305056 0.768481 \n", + "scglue 0.160093 0.352738 0.573787 \n", + "celloracle 0.443420 0.459941 0.818001 \n", + "\n", + " static-theta-0.5 overall_score \n", + "collectri 0.000000 0.000000 \n", + "negative_control 0.000000 0.000000 \n", + "positive_control 0.943410 0.945421 \n", + "pearson_corr 0.860702 0.696456 \n", + "pearson_causal 0.920231 0.862340 \n", + "portia 0.850549 0.516949 \n", + "ppcor 0.837129 0.382542 \n", + "genie3 0.946649 0.849270 \n", + "grnboost2 1.000000 0.864520 \n", + "scenic 0.942895 0.583021 \n", + "scglue 0.865371 0.487997 \n", + "celloracle 0.952505 0.668467 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create ranking \n", + "df_scores = df_scores.fillna(0)\n", + "df_scores[df_scores < 0]=0\n", + "df_scores = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", + "df_scores['overall_score'] = df_scores.mean(axis=1)\n", + "# df_scores['rank'] = df_scores.mean(axis=1).rank(ascending=False).astype(int)\n", + "\n", + "df_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Peak memory (GB)Duration (hour)ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5overall_score
collectri0.0000000.0000000.0000000.0000000.0000000.0000000.000000
negative_control0.0000000.0000000.0000000.0000000.0000000.0000000.000000
positive_control0.0000000.0000001.0000001.0000000.8382740.9434100.945421
pearson_corr0.0000000.0000000.4879190.7599620.6772410.8607020.696456
pearson_causal0.0000000.0000000.7262770.8546830.9481690.9202310.862340
portia46.9434970.1105560.3044910.3355920.5771630.8505490.516949
ppcor3.9091190.5566670.0467050.1389740.5073590.8371290.382542
genie313.10510316.6825000.7618180.7241420.9644700.9466490.849270
grnboost23.0674711.5680560.7789740.6791051.0000001.0000000.864520
scenic30.3564611.9080560.3156500.3050560.7684810.9428950.583021
scglue29.9174234.3802780.1600930.3527380.5737870.8653710.487997
celloracle14.9000001.4722220.4434200.4599410.8180010.9525050.668467
\n", + "
" + ], + "text/plain": [ + " Peak memory (GB) Duration (hour) ex(False)_tf(-1) \\\n", + "collectri 0.000000 0.000000 0.000000 \n", + "negative_control 0.000000 0.000000 0.000000 \n", + "positive_control 0.000000 0.000000 1.000000 \n", + "pearson_corr 0.000000 0.000000 0.487919 \n", + "pearson_causal 0.000000 0.000000 0.726277 \n", + "portia 46.943497 0.110556 0.304491 \n", + "ppcor 3.909119 0.556667 0.046705 \n", + "genie3 13.105103 16.682500 0.761818 \n", + "grnboost2 3.067471 1.568056 0.778974 \n", + "scenic 30.356461 1.908056 0.315650 \n", + "scglue 29.917423 4.380278 0.160093 \n", + "celloracle 14.900000 1.472222 0.443420 \n", + "\n", + " ex(True)_tf(-1) static-theta-0.0 static-theta-0.5 \\\n", + "collectri 0.000000 0.000000 0.000000 \n", + "negative_control 0.000000 0.000000 0.000000 \n", + "positive_control 1.000000 0.838274 0.943410 \n", + "pearson_corr 0.759962 0.677241 0.860702 \n", + "pearson_causal 0.854683 0.948169 0.920231 \n", + "portia 0.335592 0.577163 0.850549 \n", + "ppcor 0.138974 0.507359 0.837129 \n", + "genie3 0.724142 0.964470 0.946649 \n", + "grnboost2 0.679105 1.000000 1.000000 \n", + "scenic 0.305056 0.768481 0.942895 \n", + "scglue 0.352738 0.573787 0.865371 \n", + "celloracle 0.459941 0.818001 0.952505 \n", + "\n", + " overall_score \n", + "collectri 0.000000 \n", + "negative_control 0.000000 \n", + "positive_control 0.945421 \n", + "pearson_corr 0.696456 \n", + "pearson_causal 0.862340 \n", + "portia 0.516949 \n", + "ppcor 0.382542 \n", + "genie3 0.849270 \n", + "grnboost2 0.864520 \n", + "scenic 0.583021 \n", + "scglue 0.487997 \n", + "celloracle 0.668467 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_combined = pd.concat([df_res, df_scores], axis=1).fillna(0)\n", + "df_combined = df_combined.reindex(methods)\n", + "df_combined" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log\n", + " result = getattr(ufunc, method)(*inputs, **kwargs)\n", + "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log\n", + " result = getattr(ufunc, method)(*inputs, **kwargs)\n" + ] + } + ], + "source": [ + "summary_all_file = 'output/summary_d0_hvgs.tsv'\n", + "summary_plot = 'output/summary_d0_hvgs.pdf'\n", + "df_all = df_combined.copy()\n", + "# df_combined.index = df_combined.index.map(surragate_names)\n", + "\n", + "\n", + "df_all.index.name = 'method_name' \n", + "# df_all['mean_cpu_pct_scaled'] = df_all['%CPU']/df_all['%CPU'].max()\n", + "df_all['mean_peak_memory_log_scaled'] = np.log(df_all['Peak memory (GB)'])\n", + "# df_all['mean_peak_memory_str'] = [f\"{int(value/ 1E9)}\" for value in df_all['Peak memory']]\n", + "df_all['mean_peak_memory_str'] = df_all['Peak memory (GB)']\n", + "# df_all[\"mean_disk_read_log_scaled\"] = np.log(df_all['Disk read'])\n", + "# df_all[\"mean_disk_read_str\"] = [f\"{int(value/ 1E9)}\" for value in df_all['Disk read']]\n", + "# df_all[\"mean_disk_write_log_scaled\"] = np.log(df_all['Disk write'])\n", + "# df_all[\"mean_disk_write_str\"] = [f\"{int(value/ 1E9)}\" for value in df_all['Disk write']]\n", + "df_all[\"mean_duration_log_scaled\"] = np.log(df_all['Duration (hour)'])\n", + "# df_all[\"mean_duration_str\"] = [f\"{int(value/ 
(60*60))}\" for value in df_all['Duration']]\n", + "df_all[\"mean_duration_str\"] = df_all['Duration (hour)']\n", + "\n", + "\n", + "df_all = df_all.reset_index()\n", + "df_all.to_csv(summary_all_file, sep='\\t')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "── \u001b[1mAttaching packages\u001b[22m ─────────────────────────────────────── tidyverse 1.3.1 ──\n", + "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.3.6 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4\n", + "\u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.1.7 \u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.0.9\n", + "\u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.2.0 \u001b[32m✔\u001b[39m \u001b[34mstringr\u001b[39m 1.4.0\n", + "\u001b[32m✔\u001b[39m \u001b[34mreadr \u001b[39m 2.1.2 \u001b[32m✔\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n", + "── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────── tidyverse_conflicts() ──\n", + "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n", + "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n", + "\u001b[?25hError in library(funkyheatmap) : \n", + " there is no package called ‘funkyheatmap’\n", + "Execution halted\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "\n", + "!Rscript ../grn_benchmark/src/summary_figure.R {summary_all_file} {summary_plot}\n" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/scripts/run_benchmark_all.sh b/scripts/run_benchmark_all.sh index 44b382edf..86fb2fbc3 100644 --- a/scripts/run_benchmark_all.sh +++ b/scripts/run_benchmark_all.sh @@ -1,7 +1,6 @@ #!/bin/bash -# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -RUN_ID="celloracle_d0_hvg" +RUN_ID="d0_hvgs_baseline" # resources_dir="./resources/" resources_dir="s3://openproblems-data/resources/grn" publish_dir="${resources_dir}/results/${RUN_ID}" @@ -13,10 +12,7 @@ layer='scgen_pearson' metric_ids="[regression_1, regression_2]" cell_type_specific=false #for controls normalize=false -only_hvgs=true -# method_ids="[tigress, ennet, scsgl, pidc]" -# method_ids="[pearson_corr, pearson_causal, positive_control]" -method_ids="[celloracle]" +method_ids="[pearson_corr, pearson_causal, positive_control]" param_file="./params/${RUN_ID}.yaml" @@ -27,7 +23,7 @@ param_list: metric_ids: $metric_ids method_ids: $method_ids perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad - multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_0.h5ad + multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_0_hvgs.h5ad multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac_0.h5ad reg_type: $reg_type subsample: $subsample @@ -37,7 +33,6 @@ param_list: tf_all: ${resources_dir}/prior/tf_all.csv cell_type_specific: ${cell_type_specific} normalize: ${normalize} - only_hvgs: ${only_hvgs} output_state: "state.yaml" publish_dir: "$publish_dir" diff --git a/src/control_methods/pearson/script.py b/src/control_methods/pearson/script.py index dde473edd..1674312c1 100644 --- a/src/control_methods/pearson/script.py +++ b/src/control_methods/pearson/script.py @@ -7,7 +7,6 @@ 'max_n_links': 50000, 'prediction': 'resources/grn_models/donor_0_default/pearson.csv', "seed": 32, - 'only_hvgs': True, 'normalize': False } ## VIASH END diff --git a/src/utils/util.py b/src/utils/util.py index d8c395edc..9339483ce 
100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -50,10 +50,6 @@ def process_data(adata, par): adata.X = adata.X.toarray() # You can also use .todense(), but .toarray() gives a NumPy array directly else: print("adata.X is already dense.") - if par['only_hvgs']: - print('Subsetting data to hvgs') - adata = adata[:, adata.var.hvg_counts] - print('New dimension of data: ', adata.shape) def create_corr_net(par): print(par) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index d1604cc86..ed8361f9c 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -58,10 +58,7 @@ functionality: type: boolean required: false direction: input - - name: --only_hvgs - type: boolean - required: false - direction: input + - name: Outputs diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 4520e47bf..17ee2fb90 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -85,7 +85,6 @@ workflow run_wf { perturbation_data:"perturbation_data", cell_type_specific:"cell_type_specific", normalize:"normalize", - only_hvgs:"only_hvgs", num_workers:"num_workers" ],
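
Note (not part of the patch): the overall score introduced above is computed in the "# create ranking" cell of runs.ipynb by clipping negative metric values to zero, min-max scaling each metric column, and averaging the scaled metrics per method. The sketch below is a minimal standalone illustration of that aggregation; the helper name and the example values are assumptions for illustration only, not taken from the repository.

import pandas as pd

def add_overall_score(df_scores: pd.DataFrame) -> pd.DataFrame:
    # rows = methods, columns = metric scores
    df = df_scores.fillna(0).copy()
    df[df < 0] = 0  # negative metric values are treated as zero, as in the notebook cell
    # min-max scale each metric column; a constant column would produce NaN here
    df = (df - df.min(axis=0)) / (df.max(axis=0) - df.min(axis=0))
    df["overall_score"] = df.mean(axis=1)  # unweighted mean of the scaled metrics
    return df

# Illustrative usage with made-up values for two methods and two metrics
example = pd.DataFrame(
    {"static-theta-0.0": [0.78, 0.45], "static-theta-0.5": [0.61, 0.52]},
    index=["grnboost2", "portia"],
)
print(add_overall_score(example))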