diff --git a/batch_run.sh b/batch_run.sh
index 1b2cc8ecb..7c5544804 100644
--- a/batch_run.sh
+++ b/batch_run.sh
@@ -13,3 +13,6 @@
# sbatch --job-name=tigress_donor0_hvg scripts/sbatch/single_omics_R.sh tigress tigress
+
+
+# sbatch --job-name=tigress_donor0_hvg scripts/sbatch/calculate_scores.sh
\ No newline at end of file
diff --git a/runs.ipynb b/runs.ipynb
index 5ade52ba0..cd1c593f8 100644
--- a/runs.ipynb
+++ b/runs.ipynb
@@ -16,6 +16,26 @@
"# collectRI.to_csv(f'resources/grn_models/default/collectri.csv')"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "multiomics_atac_d0.h5ad multiomics_rna_d0.h5ad multiomics_rna.rds\n",
+ "multiomics_atac.h5ad\t multiomics_rna_d0_hvg.h5ad perturbation_data.h5ad\n",
+ "multiomics_atac.rds\t multiomics_rna.h5ad\n",
+ "multiomics_rna_0.h5ad\t multiomics_rna_hvg.h5ad\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls resources/grn-benchmark//"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -56,7 +76,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -66,6 +86,9 @@
"import numpy as np\n",
"import scanpy as sc \n",
"from src.exp_analysis.helper import plot_cumulative_density\n",
+ "import sys \n",
+ "sys.path.append('../')\n",
+ "from grn_benchmark.src.commons import surragate_names\n",
"def extract_data(data, reg='reg1', dataset_id='scgen_pearson'):\n",
" i = 0\n",
" for entry in data:\n",
@@ -120,32 +143,16 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 2,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "__MACOSX\t\t\t\t shalek.h5ad\n",
- "han.h5ad\t\t\t\t shalek_KDunion.csv\n",
- "han_KDunion.csv\t\t\t\t shalek_chipunion.csv\n",
- "han_chipunion.csv\t\t\t shalek_chipunion_KDUnion_intersect.csv\n",
- "han_chipunion_KDUnion_intersect.csv\t zhao.h5ad\n",
- "jackson.h5ad\t\t\t\t zhao_KDunion.csv\n",
- "jackson_KDunion.csv\t\t\t zhao_chipunion.csv\n",
- "jackson_chipunion.csv\t\t\t zhao_chipunion_KDUnion_intersect.csv\n",
- "jackson_chipunion_KDUnion_intersect.csv\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "!ls resources_local/mccalla_extended/"
+ "# !ls resources_local/mccalla_extended/"
]
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -175,281 +182,322 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
+ "outputs": [],
"source": [
- "## Scores"
+ "methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']"
+ "# consensus: run this after updating method list\n",
+ "# !python src/metrics/regression_2/consensus/script.py #inside the method names and dir\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## GB"
]
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Submitted batch job 7746169\n"
+ ]
+ }
+ ],
+ "source": [
+ "!sbatch scripts/sbatch/calculate_scores.sh #includes both reg1 and 2. #inside the script, set the reg_type, read and write dirs, and methods"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
- "
\n",
+ "\n",
" \n",
" \n",
" | \n",
- " ex(False)_tf(-1) | \n",
- " ex(True)_tf(-1) | \n",
- " static-theta-0.0 | \n",
- " static-theta-0.5 | \n",
+ " S1 | \n",
+ " S2 | \n",
+ " static-theta-0.0 | \n",
+ " static-theta-0.5 | \n",
"
\n",
" \n",
" \n",
" \n",
- " collectri | \n",
- " nan | \n",
- " nan | \n",
- " nan | \n",
- " nan | \n",
+ " collectri | \n",
+ " -0.100238 | \n",
+ " -0.211182 | \n",
+ " 0.489316 | \n",
+ " 0.514896 | \n",
"
\n",
" \n",
- " negative_control | \n",
- " nan | \n",
- " nan | \n",
- " nan | \n",
- " nan | \n",
+ " negative_control | \n",
+ " -0.043795 | \n",
+ " -0.045561 | \n",
+ " 0.459080 | \n",
+ " 0.505002 | \n",
"
\n",
" \n",
- " positive_control | \n",
- " 0.489147 | \n",
- " 0.677155 | \n",
- " 0.655407 | \n",
- " 0.574608 | \n",
+ " positive_control | \n",
+ " 0.489147 | \n",
+ " 0.677155 | \n",
+ " 0.655407 | \n",
+ " 0.574608 | \n",
"
\n",
" \n",
- " pearson_corr | \n",
- " 0.238664 | \n",
- " 0.514612 | \n",
- " 0.529502 | \n",
- " 0.524232 | \n",
+ " pearson_corr | \n",
+ " 0.238664 | \n",
+ " 0.514612 | \n",
+ " 0.529502 | \n",
+ " 0.524232 | \n",
"
\n",
" \n",
- " pearson_causal | \n",
- " 0.355256 | \n",
- " 0.578753 | \n",
- " 0.741328 | \n",
- " 0.560490 | \n",
+ " pearson_causal | \n",
+ " 0.355256 | \n",
+ " 0.578753 | \n",
+ " 0.741328 | \n",
+ " 0.560490 | \n",
"
\n",
" \n",
- " portia | \n",
- " 0.148941 | \n",
- " 0.227248 | \n",
- " 0.451256 | \n",
- " 0.518048 | \n",
+ " portia | \n",
+ " 0.148941 | \n",
+ " 0.227248 | \n",
+ " 0.451256 | \n",
+ " 0.518048 | \n",
"
\n",
" \n",
- " ppcor | \n",
- " 0.022846 | \n",
- " 0.094107 | \n",
- " 0.396680 | \n",
- " 0.509874 | \n",
+ " ppcor | \n",
+ " 0.022846 | \n",
+ " 0.094107 | \n",
+ " 0.396680 | \n",
+ " 0.509874 | \n",
"
\n",
" \n",
- " genie3 | \n",
- " 0.372641 | \n",
- " 0.490357 | \n",
- " 0.754073 | \n",
- " 0.576580 | \n",
+ " genie3 | \n",
+ " 0.372641 | \n",
+ " 0.490357 | \n",
+ " 0.754073 | \n",
+ " 0.576580 | \n",
"
\n",
" \n",
- " grnboost2 | \n",
- " 0.381032 | \n",
- " 0.459860 | \n",
- " 0.781852 | \n",
- " 0.609075 | \n",
+ " grnboost2 | \n",
+ " 0.381032 | \n",
+ " 0.459860 | \n",
+ " 0.781852 | \n",
+ " 0.609075 | \n",
"
\n",
" \n",
- " scenic | \n",
- " 0.154399 | \n",
- " 0.206571 | \n",
- " 0.600839 | \n",
- " 0.574294 | \n",
+ " scenic | \n",
+ " 0.147553 | \n",
+ " 0.214694 | \n",
+ " 0.600839 | \n",
+ " 0.574294 | \n",
"
\n",
" \n",
- " scglue | \n",
- " 0.078309 | \n",
- " 0.238859 | \n",
- " 0.448617 | \n",
- " 0.527076 | \n",
+ " scglue | \n",
+ " 0.078309 | \n",
+ " 0.238859 | \n",
+ " 0.448617 | \n",
+ " 0.527076 | \n",
"
\n",
" \n",
- " celloracle | \n",
- " 0.216897 | \n",
- " 0.311451 | \n",
- " 0.639556 | \n",
- " 0.580147 | \n",
+ " celloracle | \n",
+ " 0.216897 | \n",
+ " 0.311451 | \n",
+ " 0.639556 | \n",
+ " 0.580147 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 33,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "RUN_ID=\"d0_hvgs\" \n",
- "df_scores = process_data(RUN_ID, models_all=methods)\n",
+ "df_scores = pd.read_csv('resources/results/scores/ridge.csv', index_col=0)\n",
"df_scores.style.background_gradient()"
]
},
@@ -460,26 +508,33 @@
"## Format resourcs used"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### local runs"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
" df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
- "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
" df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
- "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
" df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
- "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
" df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
- "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
" df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
- "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
" df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n"
]
},
@@ -596,15 +651,15 @@
"scglue 35.933720 "
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# extract trace for local jobs\n",
+ "# extract trace_seqerafor local jobs\n",
"job_ids = {\n",
- " 'portia': 7742241,\n",
+ " 'portia': 7744548,\n",
" 'grnboost2': 7742249,\n",
" 'scenic': 7742283,\n",
" 'genie3': 7742285,\n",
@@ -646,11 +701,28 @@
"df_local"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### sequra runs: multiple runs"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 7,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2779943256.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+ " trace = trace.groupby('model').apply(lambda df: df.sort_values(by='duration', ascending=False).iloc[0])[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n",
+ "/vol/tmp/users/jnourisa/ipykernel_1858346/2779943256.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+ " trace = trace.groupby('model').apply(lambda df: df.sort_values(by='duration', ascending=False).iloc[0])[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -679,15 +751,51 @@
" wchar | \n",
" duration | \n",
" \n",
+ " \n",
+ " model | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
" \n",
" \n",
" \n",
+ " pearson_causal | \n",
+ " 515.0 | \n",
+ " 0.9747 | \n",
+ " 4.0 | \n",
+ " 0.0659 | \n",
+ " 0.0014 | \n",
+ " 0.064167 | \n",
+ "
\n",
+ " \n",
+ " pearson_corr | \n",
+ " 528.2 | \n",
+ " 0.9751 | \n",
+ " 4.0 | \n",
+ " 0.0659 | \n",
+ " 0.0014 | \n",
+ " 0.072500 | \n",
+ "
\n",
+ " \n",
+ " positive_control | \n",
+ " 200.1 | \n",
+ " 4.9000 | \n",
+ " 8.0 | \n",
+ " 2.3000 | \n",
+ " 0.0018 | \n",
+ " 0.075278 | \n",
+ "
\n",
+ " \n",
" celloracle | \n",
" 799.6 | \n",
- " 14.9 | \n",
+ " 14.9000 | \n",
" 35.4 | \n",
- " 18.0 | \n",
- " 86.1 | \n",
+ " 18.0000 | \n",
+ " 86.1000 | \n",
" 1.472222 | \n",
"
\n",
" \n",
@@ -695,63 +803,84 @@
""
],
"text/plain": [
- " %cpu peak_rss peak_vmem rchar wchar duration\n",
- "celloracle 799.6 14.9 35.4 18.0 86.1 1.472222"
+ " %cpu peak_rss peak_vmem rchar wchar duration\n",
+ "model \n",
+ "pearson_causal 515.0 0.9747 4.0 0.0659 0.0014 0.064167\n",
+ "pearson_corr 528.2 0.9751 4.0 0.0659 0.0014 0.072500\n",
+ "positive_control 200.1 4.9000 8.0 2.3000 0.0018 0.075278\n",
+ "celloracle 799.6 14.9000 35.4 18.0000 86.1000 1.472222"
]
},
- "execution_count": 10,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# sequra runs\n",
- "base_dir = 'resources/results/d0_hvgs_res/'\n",
- "models = ['celloracle']\n",
- "\n",
- "def convert_duration_to_hours(duration_str):\n",
- " import re\n",
- " hours, minutes, seconds = 0, 0, 0\n",
- " time_parts = re.findall(r'(\\d+)([hms])', duration_str)\n",
- " for value, unit in time_parts:\n",
- " if unit == 'h':\n",
- " hours = int(value)\n",
- " elif unit == 'm':\n",
- " minutes = int(value)\n",
- " elif unit == 's':\n",
- " seconds = int(value)\n",
- " return (hours * 3600 + minutes * 60 + seconds)/3600\n",
- "def format_gb(gb_str):\n",
- " gb_value = float(gb_str.split()[0])\n",
- " return gb_value \n",
"\n",
+ "def process_trace(trace):\n",
+ " trace['model'] = pd.DataFrame(trace.name.str.split(':').to_list())[3] #TODO: number 3 might be different \n",
+ " trace = trace[trace['model'].isin(models)]\n",
+ " trace = trace.groupby('model').apply(lambda df: df.sort_values(by='duration', ascending=False).iloc[0])[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n",
"\n",
- "for i, method in enumerate(models):\n",
- " df = pd.read_csv(f'{base_dir}/{method}.txt', sep='\\t')\n",
- " df = df[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n",
- " df = df.drop(1)\n",
"\n",
+ " def convert_duration_to_hours(duration_str):\n",
+ " import re\n",
+ " hours, minutes, seconds = 0, 0, 0\n",
+ " time_parts = re.findall(r'(\\d+)([hms])', duration_str)\n",
+ " for value, unit in time_parts:\n",
+ " if unit == 'h':\n",
+ " hours = int(value)\n",
+ " elif unit == 'm':\n",
+ " minutes = int(value)\n",
+ " elif unit == 's':\n",
+ " seconds = int(value)\n",
+ " return (hours * 3600 + minutes * 60 + seconds)/3600\n",
+ " def format_ram(row):\n",
+ " value = float(row.split()[0])\n",
+ " if 'GB' in row:\n",
+ " value = value\n",
+ " elif 'MB' in row:\n",
+ " value = value/1000\n",
+ " else:\n",
+ " raise ValueError('Define')\n",
+ " return value \n",
"\n",
- " for col in df.columns:\n",
+ " for col in trace.columns:\n",
" if col=='%cpu':\n",
- " df[col] = df[col].str.replace('%', '').astype(float)\n",
+ " trace[col] = trace[col].str.replace('%', '').astype(float)\n",
" elif col=='duration':\n",
- " df[col] = df[col].apply(convert_duration_to_hours)\n",
+ " trace[col] = trace[col].apply(convert_duration_to_hours)\n",
" else:\n",
- " df[col] = df[col].apply(format_gb)\n",
- " df = df.sort_values(by='duration', ascending=False).iloc[[0]]\n",
- " \n",
- " df.index = [method]\n",
- " if i==0:\n",
- " df_sequera = df \n",
- " else:\n",
- " df_sequera = pd.concat([df_sequera, df], axis=0)\n",
- "df_sequera"
+ " trace[col] = trace[col].apply(format_ram)\n",
+ " return trace \n",
+ "\n",
+ "models = ['pearson_causal', 'pearson_corr', 'positive_control']\n",
+ "base_dir = 'resources/results/d0_hvgs_baseline'\n",
+ "trace = pd.read_csv(f'{base_dir}/trace.txt', sep='\\t')\n",
+ "\n",
+ "trace_baselines = process_trace(trace)\n",
+ "\n",
+ "models = ['celloracle']\n",
+ "base_dir = 'resources/results/celloracle_d0_hvgs'\n",
+ "trace = pd.read_csv(f'{base_dir}/trace.txt', sep='\\t')\n",
+ "trace_models = process_trace(trace)\n",
+ "\n",
+ "\n",
+ "trace_seqera = pd.concat([trace_baselines, trace_models], axis=0)\n",
+ "trace_seqera"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### merge local and cluster dfs "
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -781,6 +910,21 @@
" \n",
" \n",
" \n",
+ " pearson_causal | \n",
+ " 0.974700 | \n",
+ " 0.064167 | \n",
+ "
\n",
+ " \n",
+ " pearson_corr | \n",
+ " 0.975100 | \n",
+ " 0.072500 | \n",
+ "
\n",
+ " \n",
+ " positive_control | \n",
+ " 4.900000 | \n",
+ " 0.075278 | \n",
+ "
\n",
+ " \n",
" celloracle | \n",
" 14.900000 | \n",
" 1.472222 | \n",
@@ -820,17 +964,20 @@
""
],
"text/plain": [
- " Peak memory (GB) Duration (hour)\n",
- "celloracle 14.900000 1.472222\n",
- "portia 46.943497 0.110556\n",
- "grnboost2 3.067471 1.568056\n",
- "scenic 30.356461 1.908056\n",
- "genie3 13.105103 16.682500\n",
- "ppcor 3.909119 0.556667\n",
- "scglue 29.917423 4.380278"
+ " Peak memory (GB) Duration (hour)\n",
+ "pearson_causal 0.974700 0.064167\n",
+ "pearson_corr 0.975100 0.072500\n",
+ "positive_control 4.900000 0.075278\n",
+ "celloracle 14.900000 1.472222\n",
+ "portia 46.943497 0.110556\n",
+ "grnboost2 3.067471 1.568056\n",
+ "scenic 30.356461 1.908056\n",
+ "genie3 13.105103 16.682500\n",
+ "ppcor 3.909119 0.556667\n",
+ "scglue 29.917423 4.380278"
]
},
- "execution_count": 11,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -840,12 +987,12 @@
"map_dict = {'peak_rss':'MaxRSS', 'duration':'Elapsed'}\n",
"\n",
"df_local = df_local[map_dict.values()]\n",
- "df_sequera = df_sequera[map_dict.keys()]\n",
+ "trace_seqera = trace_seqera[map_dict.keys()]\n",
"\n",
- "df_sequera.columns = df_sequera.columns.map(map_dict)\n",
+ "trace_seqera.columns = trace_seqera.columns.map(map_dict)\n",
"\n",
"\n",
- "df_res = pd.concat([df_sequera, df_local], axis=0)\n",
+ "df_res = pd.concat([trace_seqera, df_local], axis=0)\n",
"\n",
"df_res.columns = ['Peak memory (GB)', 'Duration (hour)']\n",
"\n",
@@ -861,7 +1008,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
@@ -885,145 +1032,184 @@
" \n",
" \n",
" | \n",
+ " method_name | \n",
" ex(False)_tf(-1) | \n",
" ex(True)_tf(-1) | \n",
" static-theta-0.0 | \n",
" static-theta-0.5 | \n",
" overall_score | \n",
+ " Peak memory (GB) | \n",
+ " Duration (hour) | \n",
"
\n",
" \n",
"
\n",
" \n",
- " collectri | \n",
- " 0.000000 | \n",
+ " 0 | \n",
+ " collectri | \n",
" 0.000000 | \n",
" 0.000000 | \n",
+ " 0.240505 | \n",
+ " 0.095068 | \n",
+ " 0.077646 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
- " negative_control | \n",
+ " 1 | \n",
+ " negative_control | \n",
" 0.000000 | \n",
" 0.000000 | \n",
+ " 0.162005 | \n",
" 0.000000 | \n",
+ " 0.032401 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
- " positive_control | \n",
+ " 2 | \n",
+ " positive_control | \n",
" 1.000000 | \n",
" 1.000000 | \n",
- " 0.838274 | \n",
- " 0.943410 | \n",
- " 0.945421 | \n",
+ " 0.671716 | \n",
+ " 0.668815 | \n",
+ " 0.860974 | \n",
+ " 4.900000 | \n",
+ " 0.075278 | \n",
"
\n",
" \n",
- " pearson_corr | \n",
+ " 3 | \n",
+ " pearson_corr | \n",
" 0.487919 | \n",
" 0.759962 | \n",
- " 0.677241 | \n",
- " 0.860702 | \n",
- " 0.696456 | \n",
+ " 0.344838 | \n",
+ " 0.184772 | \n",
+ " 0.453523 | \n",
+ " 0.975100 | \n",
+ " 0.072500 | \n",
"
\n",
" \n",
- " pearson_causal | \n",
+ " 4 | \n",
+ " pearson_causal | \n",
" 0.726277 | \n",
" 0.854683 | \n",
- " 0.948169 | \n",
- " 0.920231 | \n",
- " 0.862340 | \n",
+ " 0.894789 | \n",
+ " 0.533161 | \n",
+ " 0.774527 | \n",
+ " 0.974700 | \n",
+ " 0.064167 | \n",
"
\n",
" \n",
- " portia | \n",
+ " 5 | \n",
+ " portia | \n",
" 0.304491 | \n",
" 0.335592 | \n",
- " 0.577163 | \n",
- " 0.850549 | \n",
- " 0.516949 | \n",
+ " 0.141693 | \n",
+ " 0.125352 | \n",
+ " 0.226639 | \n",
+ " 46.943497 | \n",
+ " 0.110556 | \n",
"
\n",
" \n",
- " ppcor | \n",
+ " 6 | \n",
+ " ppcor | \n",
" 0.046705 | \n",
" 0.138974 | \n",
- " 0.507359 | \n",
- " 0.837129 | \n",
- " 0.382542 | \n",
+ " 0.000000 | \n",
+ " 0.046814 | \n",
+ " 0.050776 | \n",
+ " 3.909119 | \n",
+ " 0.556667 | \n",
"
\n",
" \n",
- " genie3 | \n",
+ " 7 | \n",
+ " genie3 | \n",
" 0.761818 | \n",
" 0.724142 | \n",
- " 0.964470 | \n",
- " 0.946649 | \n",
- " 0.849270 | \n",
+ " 0.927879 | \n",
+ " 0.687766 | \n",
+ " 0.798691 | \n",
+ " 13.105103 | \n",
+ " 16.682500 | \n",
"
\n",
" \n",
- " grnboost2 | \n",
+ " 8 | \n",
+ " grnboost2 | \n",
" 0.778974 | \n",
" 0.679105 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
- " 0.864520 | \n",
+ " 0.891616 | \n",
+ " 3.067471 | \n",
+ " 1.568056 | \n",
"
\n",
" \n",
- " scenic | \n",
- " 0.315650 | \n",
+ " 9 | \n",
+ " scenic | \n",
+ " 0.295812 | \n",
" 0.305056 | \n",
- " 0.768481 | \n",
- " 0.942895 | \n",
- " 0.583021 | \n",
+ " 0.530045 | \n",
+ " 0.665799 | \n",
+ " 0.458534 | \n",
+ " 30.356461 | \n",
+ " 1.908056 | \n",
"
\n",
" \n",
- " scglue | \n",
+ " 10 | \n",
+ " scglue | \n",
" 0.160093 | \n",
" 0.352738 | \n",
- " 0.573787 | \n",
- " 0.865371 | \n",
- " 0.487997 | \n",
+ " 0.134840 | \n",
+ " 0.212101 | \n",
+ " 0.214294 | \n",
+ " 29.917423 | \n",
+ " 4.380278 | \n",
"
\n",
" \n",
- " celloracle | \n",
+ " 11 | \n",
+ " celloracle | \n",
" 0.443420 | \n",
" 0.459941 | \n",
- " 0.818001 | \n",
- " 0.952505 | \n",
- " 0.668467 | \n",
+ " 0.630565 | \n",
+ " 0.722041 | \n",
+ " 0.578251 | \n",
+ " 14.900000 | \n",
+ " 1.472222 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
- " ex(False)_tf(-1) ex(True)_tf(-1) static-theta-0.0 \\\n",
- "collectri 0.000000 0.000000 0.000000 \n",
- "negative_control 0.000000 0.000000 0.000000 \n",
- "positive_control 1.000000 1.000000 0.838274 \n",
- "pearson_corr 0.487919 0.759962 0.677241 \n",
- "pearson_causal 0.726277 0.854683 0.948169 \n",
- "portia 0.304491 0.335592 0.577163 \n",
- "ppcor 0.046705 0.138974 0.507359 \n",
- "genie3 0.761818 0.724142 0.964470 \n",
- "grnboost2 0.778974 0.679105 1.000000 \n",
- "scenic 0.315650 0.305056 0.768481 \n",
- "scglue 0.160093 0.352738 0.573787 \n",
- "celloracle 0.443420 0.459941 0.818001 \n",
+ " method_name ex(False)_tf(-1) ex(True)_tf(-1) static-theta-0.0 \\\n",
+ "0 collectri 0.000000 0.000000 0.240505 \n",
+ "1 negative_control 0.000000 0.000000 0.162005 \n",
+ "2 positive_control 1.000000 1.000000 0.671716 \n",
+ "3 pearson_corr 0.487919 0.759962 0.344838 \n",
+ "4 pearson_causal 0.726277 0.854683 0.894789 \n",
+ "5 portia 0.304491 0.335592 0.141693 \n",
+ "6 ppcor 0.046705 0.138974 0.000000 \n",
+ "7 genie3 0.761818 0.724142 0.927879 \n",
+ "8 grnboost2 0.778974 0.679105 1.000000 \n",
+ "9 scenic 0.295812 0.305056 0.530045 \n",
+ "10 scglue 0.160093 0.352738 0.134840 \n",
+ "11 celloracle 0.443420 0.459941 0.630565 \n",
"\n",
- " static-theta-0.5 overall_score \n",
- "collectri 0.000000 0.000000 \n",
- "negative_control 0.000000 0.000000 \n",
- "positive_control 0.943410 0.945421 \n",
- "pearson_corr 0.860702 0.696456 \n",
- "pearson_causal 0.920231 0.862340 \n",
- "portia 0.850549 0.516949 \n",
- "ppcor 0.837129 0.382542 \n",
- "genie3 0.946649 0.849270 \n",
- "grnboost2 1.000000 0.864520 \n",
- "scenic 0.942895 0.583021 \n",
- "scglue 0.865371 0.487997 \n",
- "celloracle 0.952505 0.668467 "
+ " static-theta-0.5 overall_score Peak memory (GB) Duration (hour) \n",
+ "0 0.095068 0.077646 0.000000 0.000000 \n",
+ "1 0.000000 0.032401 0.000000 0.000000 \n",
+ "2 0.668815 0.860974 4.900000 0.075278 \n",
+ "3 0.184772 0.453523 0.975100 0.072500 \n",
+ "4 0.533161 0.774527 0.974700 0.064167 \n",
+ "5 0.125352 0.226639 46.943497 0.110556 \n",
+ "6 0.046814 0.050776 3.909119 0.556667 \n",
+ "7 0.687766 0.798691 13.105103 16.682500 \n",
+ "8 1.000000 0.891616 3.067471 1.568056 \n",
+ "9 0.665799 0.458534 30.356461 1.908056 \n",
+ "10 0.212101 0.214294 29.917423 4.380278 \n",
+ "11 0.722041 0.578251 14.900000 1.472222 "
]
},
- "execution_count": 34,
+ "execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
@@ -1036,269 +1222,24 @@
"df_scores['overall_score'] = df_scores.mean(axis=1)\n",
"# df_scores['rank'] = df_scores.mean(axis=1).rank(ascending=False).astype(int)\n",
"\n",
- "df_scores"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Peak memory (GB) | \n",
- " Duration (hour) | \n",
- " ex(False)_tf(-1) | \n",
- " ex(True)_tf(-1) | \n",
- " static-theta-0.0 | \n",
- " static-theta-0.5 | \n",
- " overall_score | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " collectri | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " negative_control | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " positive_control | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0.838274 | \n",
- " 0.943410 | \n",
- " 0.945421 | \n",
- "
\n",
- " \n",
- " pearson_corr | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.487919 | \n",
- " 0.759962 | \n",
- " 0.677241 | \n",
- " 0.860702 | \n",
- " 0.696456 | \n",
- "
\n",
- " \n",
- " pearson_causal | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " 0.726277 | \n",
- " 0.854683 | \n",
- " 0.948169 | \n",
- " 0.920231 | \n",
- " 0.862340 | \n",
- "
\n",
- " \n",
- " portia | \n",
- " 46.943497 | \n",
- " 0.110556 | \n",
- " 0.304491 | \n",
- " 0.335592 | \n",
- " 0.577163 | \n",
- " 0.850549 | \n",
- " 0.516949 | \n",
- "
\n",
- " \n",
- " ppcor | \n",
- " 3.909119 | \n",
- " 0.556667 | \n",
- " 0.046705 | \n",
- " 0.138974 | \n",
- " 0.507359 | \n",
- " 0.837129 | \n",
- " 0.382542 | \n",
- "
\n",
- " \n",
- " genie3 | \n",
- " 13.105103 | \n",
- " 16.682500 | \n",
- " 0.761818 | \n",
- " 0.724142 | \n",
- " 0.964470 | \n",
- " 0.946649 | \n",
- " 0.849270 | \n",
- "
\n",
- " \n",
- " grnboost2 | \n",
- " 3.067471 | \n",
- " 1.568056 | \n",
- " 0.778974 | \n",
- " 0.679105 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0.864520 | \n",
- "
\n",
- " \n",
- " scenic | \n",
- " 30.356461 | \n",
- " 1.908056 | \n",
- " 0.315650 | \n",
- " 0.305056 | \n",
- " 0.768481 | \n",
- " 0.942895 | \n",
- " 0.583021 | \n",
- "
\n",
- " \n",
- " scglue | \n",
- " 29.917423 | \n",
- " 4.380278 | \n",
- " 0.160093 | \n",
- " 0.352738 | \n",
- " 0.573787 | \n",
- " 0.865371 | \n",
- " 0.487997 | \n",
- "
\n",
- " \n",
- " celloracle | \n",
- " 14.900000 | \n",
- " 1.472222 | \n",
- " 0.443420 | \n",
- " 0.459941 | \n",
- " 0.818001 | \n",
- " 0.952505 | \n",
- " 0.668467 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Peak memory (GB) Duration (hour) ex(False)_tf(-1) \\\n",
- "collectri 0.000000 0.000000 0.000000 \n",
- "negative_control 0.000000 0.000000 0.000000 \n",
- "positive_control 0.000000 0.000000 1.000000 \n",
- "pearson_corr 0.000000 0.000000 0.487919 \n",
- "pearson_causal 0.000000 0.000000 0.726277 \n",
- "portia 46.943497 0.110556 0.304491 \n",
- "ppcor 3.909119 0.556667 0.046705 \n",
- "genie3 13.105103 16.682500 0.761818 \n",
- "grnboost2 3.067471 1.568056 0.778974 \n",
- "scenic 30.356461 1.908056 0.315650 \n",
- "scglue 29.917423 4.380278 0.160093 \n",
- "celloracle 14.900000 1.472222 0.443420 \n",
- "\n",
- " ex(True)_tf(-1) static-theta-0.0 static-theta-0.5 \\\n",
- "collectri 0.000000 0.000000 0.000000 \n",
- "negative_control 0.000000 0.000000 0.000000 \n",
- "positive_control 1.000000 0.838274 0.943410 \n",
- "pearson_corr 0.759962 0.677241 0.860702 \n",
- "pearson_causal 0.854683 0.948169 0.920231 \n",
- "portia 0.335592 0.577163 0.850549 \n",
- "ppcor 0.138974 0.507359 0.837129 \n",
- "genie3 0.724142 0.964470 0.946649 \n",
- "grnboost2 0.679105 1.000000 1.000000 \n",
- "scenic 0.305056 0.768481 0.942895 \n",
- "scglue 0.352738 0.573787 0.865371 \n",
- "celloracle 0.459941 0.818001 0.952505 \n",
- "\n",
- " overall_score \n",
- "collectri 0.000000 \n",
- "negative_control 0.000000 \n",
- "positive_control 0.945421 \n",
- "pearson_corr 0.696456 \n",
- "pearson_causal 0.862340 \n",
- "portia 0.516949 \n",
- "ppcor 0.382542 \n",
- "genie3 0.849270 \n",
- "grnboost2 0.864520 \n",
- "scenic 0.583021 \n",
- "scglue 0.487997 \n",
- "celloracle 0.668467 "
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_combined = pd.concat([df_res, df_scores], axis=1).fillna(0)\n",
- "df_combined = df_combined.reindex(methods)\n",
- "df_combined"
+ "df_all = pd.concat([df_scores, df_res], axis=1)\n",
+ "df_all = df_all.fillna(0)\n",
+ "df_all.index.name = 'method_name' \n",
+ "df_all = df_all.reset_index()\n",
+ "\n",
+ "df_all \n"
]
},
{
- "cell_type": "code",
- "execution_count": 36,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log\n",
- " result = getattr(ufunc, method)(*inputs, **kwargs)\n",
- "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log\n",
- " result = getattr(ufunc, method)(*inputs, **kwargs)\n"
- ]
- }
- ],
"source": [
- "summary_all_file = 'output/summary_d0_hvgs.tsv'\n",
- "summary_plot = 'output/summary_d0_hvgs.pdf'\n",
- "df_all = df_combined.copy()\n",
- "# df_combined.index = df_combined.index.map(surragate_names)\n",
- "\n",
- "\n",
- "df_all.index.name = 'method_name' \n",
- "# df_all['mean_cpu_pct_scaled'] = df_all['%CPU']/df_all['%CPU'].max()\n",
- "df_all['mean_peak_memory_log_scaled'] = np.log(df_all['Peak memory (GB)'])\n",
- "# df_all['mean_peak_memory_str'] = [f\"{int(value/ 1E9)}\" for value in df_all['Peak memory']]\n",
- "df_all['mean_peak_memory_str'] = df_all['Peak memory (GB)']\n",
- "# df_all[\"mean_disk_read_log_scaled\"] = np.log(df_all['Disk read'])\n",
- "# df_all[\"mean_disk_read_str\"] = [f\"{int(value/ 1E9)}\" for value in df_all['Disk read']]\n",
- "# df_all[\"mean_disk_write_log_scaled\"] = np.log(df_all['Disk write'])\n",
- "# df_all[\"mean_disk_write_str\"] = [f\"{int(value/ 1E9)}\" for value in df_all['Disk write']]\n",
- "df_all[\"mean_duration_log_scaled\"] = np.log(df_all['Duration (hour)'])\n",
- "# df_all[\"mean_duration_str\"] = [f\"{int(value/ (60*60))}\" for value in df_all['Duration']]\n",
- "df_all[\"mean_duration_str\"] = df_all['Duration (hour)']\n",
- "\n",
- "\n",
- "df_all = df_all.reset_index()\n",
- "df_all.to_csv(summary_all_file, sep='\\t')\n",
- "\n"
+ "## Summary figure"
]
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 49,
"metadata": {},
"outputs": [
{
@@ -1306,23 +1247,68 @@
"output_type": "stream",
"text": [
"── \u001b[1mAttaching packages\u001b[22m ─────────────────────────────────────── tidyverse 1.3.1 ──\n",
- "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.3.6 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4\n",
- "\u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.1.7 \u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.0.9\n",
+ "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.5.1 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4\n",
+ "\u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.2.1 \u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.1.4\n",
"\u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.2.0 \u001b[32m✔\u001b[39m \u001b[34mstringr\u001b[39m 1.4.0\n",
"\u001b[32m✔\u001b[39m \u001b[34mreadr \u001b[39m 2.1.2 \u001b[32m✔\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n",
"── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────── tidyverse_conflicts() ──\n",
"\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n",
"\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n",
- "\u001b[?25hError in library(funkyheatmap) : \n",
- " there is no package called ‘funkyheatmap’\n",
- "Execution halted\n",
- "\u001b[?25h"
+ "\u001b[?25h\u001b[?25h\u001b[?25hWarning message:\n",
+ "\u001b[1m\u001b[22m`thisfile()` was deprecated in rprojroot 2.0.0.\n",
+ "\u001b[36mℹ\u001b[39m Please use `whereami::thisfile()` instead. \n",
+ "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[1m\u001b[22mNew names:\n",
+ "\u001b[36m•\u001b[39m `` -> `...1`\n",
+ "\u001b[1mRows: \u001b[22m\u001b[34m12\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m13\u001b[39m\n",
+ "\u001b[36m──\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m────────────────────────────────────────────────────────\u001b[39m\n",
+ "\u001b[1mDelimiter:\u001b[22m \"\\t\"\n",
+ "\u001b[31mchr\u001b[39m (1): method_name\n",
+ "\u001b[32mdbl\u001b[39m (12): ...1, ex(False)_tf(-1), ex(True)_tf(-1), static-theta-0.0, static-...\n",
+ "\n",
+ "\u001b[36mℹ\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n",
+ "\u001b[36mℹ\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n",
+ "\u001b[?25h\u001b[?25h\u001b[90m# A tibble: 10 × 7\u001b[39m\n",
+ " id id_color name group geom palette options \n",
+ " \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \n",
+ "\u001b[90m 1\u001b[39m method_name \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mName\u001b[90m\"\u001b[39m meth… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n",
+ "\u001b[90m 2\u001b[39m overall_score overall_score \u001b[90m\"\u001b[39mScore\u001b[90m\"\u001b[39m over… bar overall \u001b[90m\u001b[39m\n",
+ "\u001b[90m 3\u001b[39m ex(False)_tf(-1) ex(False)_tf(-1) \u001b[90m\"\u001b[39mS11\u001b[90m\"\u001b[39m metr… funk… metric… \u001b[90m\u001b[39m\n",
+ "\u001b[90m 4\u001b[39m ex(True)_tf(-1) ex(True)_tf(-1) \u001b[90m\"\u001b[39mS12\u001b[90m\"\u001b[39m metr… funk… metric… \u001b[90m\u001b[39m\n",
+ "\u001b[90m 5\u001b[39m static-theta-0.0 static-theta-0.0 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n",
+ "\u001b[90m 6\u001b[39m static-theta-0.5 static-theta-0.5 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n",
+ "\u001b[90m 7\u001b[39m memory_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mPeak mem… reso… rect resour… \u001b[90m\u001b[39m\n",
+ "\u001b[90m 8\u001b[39m memory_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n",
+ "\u001b[90m 9\u001b[39m duration_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mDuration… reso… rect resour… \u001b[90m\u001b[39m\n",
+ "\u001b[90m10\u001b[39m duration_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n",
+ "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[36mℹ\u001b[39m Could not find column 'id' in data. Using rownames as 'id'.\n",
+ "\u001b[36mℹ\u001b[39m Column info did not contain a column called 'legend', generating options based on the 'geom' column.\n",
+ "\u001b[36mℹ\u001b[39m No row info was provided, assuming all rows in `data` are to be plotted.\n",
+ "\u001b[36mℹ\u001b[39m Row info did not contain group information, assuming rows are ungrouped.\n",
+ "\u001b[36mℹ\u001b[39m Legend 1 did not contain color, inferring from the palette.\n",
+ "\u001b[36mℹ\u001b[39m Legend 2 did not contain color, inferring from the palette.\n",
+ "\u001b[?25h\u001b[?25h\u001b[?25h"
]
}
],
"source": [
"\n",
- "!Rscript ../grn_benchmark/src/summary_figure.R {summary_all_file} {summary_plot}\n"
+ "summary_file = \"output/summary_d0_hvg.tsv\"\n",
+ "summary_figure = \"output/summary_d0_hvg_figure.pdf\"\n",
+ "\n",
+ "df_all['memory_log'] = np.log(df_all['Peak memory (GB)']+1)\n",
+ "df_all['memory_log'] = np.max(df_all['memory_log'])-df_all['memory_log']\n",
+ "\n",
+ "\n",
+ "df_all[\"duration_log\"] = np.log(df_all['Duration (hour)']+1)\n",
+ "df_all['duration_log'] = np.max(df_all['duration_log'])-df_all['duration_log']\n",
+ "\n",
+ "df_all[\"duration_str\"] = df_all['Duration (hour)'].round(1).astype(str)\n",
+ "df_all['memory_str'] = df_all['Peak memory (GB)'].round(1).astype(str)\n",
+ "\n",
+ "\n",
+ "df_all.to_csv(summary_file, sep='\\t')\n",
+ "\n",
+ "!Rscript ../grn_benchmark/src/metrics_figure.R {summary_file} {summary_figure}\n"
]
},
{
@@ -1334,27 +1320,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
- "if par['metacell']:\n",
- " print('metacell')\n",
- " def create_meta_cells(df, n_cells=15):\n",
- " meta_x = []\n",
- " for i in range(0, df.shape[0], n_cells):\n",
- " meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values)\n",
- " df = pd.DataFrame(meta_x, columns=df.columns)\n",
- " return df\n",
+ "# if par['metacell']:\n",
+ "# print('metacell')\n",
+ "# def create_meta_cells(df, n_cells=15):\n",
+ "# meta_x = []\n",
+ "# for i in range(0, df.shape[0], n_cells):\n",
+ "# meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values)\n",
+ "# df = pd.DataFrame(meta_x, columns=df.columns)\n",
+ "# return df\n",
" \n",
- " adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names)\n",
- " adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values\n",
- " adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values\n",
- " df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df))\n",
- " X = df.values\n",
- " var = pd.DataFrame(index=df.columns)\n",
- " obs = df.reset_index()[['cell_type','donor_id']]\n",
- " multiomics_rna = ad.AnnData(X=X, obs=obs, var=var)"
+ "# adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names)\n",
+ "# adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values\n",
+ "# adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values\n",
+ "# df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df))\n",
+ "# X = df.values\n",
+ "# var = pd.DataFrame(index=df.columns)\n",
+ "# obs = df.reset_index()[['cell_type','donor_id']]\n",
+ "# multiomics_rna = ad.AnnData(X=X, obs=obs, var=var)"
]
},
{
@@ -1392,7 +1378,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Shuffle net"
+ "## Permute net"
]
},
{
diff --git a/scripts/run_benchmark_all.sh b/scripts/run_benchmark_all.sh
index 86fb2fbc3..b27f63522 100644
--- a/scripts/run_benchmark_all.sh
+++ b/scripts/run_benchmark_all.sh
@@ -23,8 +23,8 @@ param_list:
metric_ids: $metric_ids
method_ids: $method_ids
perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
- multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_0_hvgs.h5ad
- multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac_0.h5ad
+ multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_d0_hvg.h5ad
+ multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac_d0.h5ad
reg_type: $reg_type
subsample: $subsample
num_workers: $num_workers
diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh
index 878afe48c..b1bed3e56 100644
--- a/scripts/run_grn_evaluation.sh
+++ b/scripts/run_grn_evaluation.sh
@@ -47,7 +47,7 @@ grn_names=(
"collectri"
)
-
+echo $grn_names
# Start writing to the YAML file
cat > $param_file << HERE
param_list:
diff --git a/scripts/run_robust_analys.sh b/scripts/run_robust_analys.sh
index f8348c950..285dde911 100644
--- a/scripts/run_robust_analys.sh
+++ b/scripts/run_robust_analys.sh
@@ -1,18 +1,16 @@
#!/bin/bash
-# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-
degrees=(0 10 20 50 100)
noise_type="$1" #"net", "weight", "sign"
echo $noise_type
-RUN_ID="robust_analy_reg2_$1"
+RUN_ID="robust_analy_$1"
# resources_dir="resources"
resources_dir="s3://openproblems-data/resources/grn"
publish_dir="${resources_dir}/results/${RUN_ID}"
-grn_models_folder="${resources_dir}/grn_models"
+grn_models_folder="${resources_dir}/grn_models/d0_hvgs"
reg_type=ridge
@@ -22,12 +20,18 @@ max_workers=10
param_file="./params/${RUN_ID}.yaml"
grn_names=(
- "collectri"
- "celloracle"
- "scenicplus"
- "figr"
- "granie"
- "scglue"
+ 'collectri'
+ 'negative_control'
+ 'positive_control'
+ 'pearson_corr'
+ 'pearson_causal'
+ 'portia'
+ 'ppcor'
+ 'genie3'
+ 'grnboost2'
+ 'scenic'
+ 'scglue'
+ 'celloracle'
)
@@ -36,6 +40,7 @@ cat > $param_file << HERE
param_list:
HERE
+
append_entry() {
cat >> $param_file << HERE
- id: ${1}_${2}_${3}
diff --git a/scripts/sbatch/calculate_scores.sh b/scripts/sbatch/calculate_scores.sh
new file mode 100644
index 000000000..774d9fcab
--- /dev/null
+++ b/scripts/sbatch/calculate_scores.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+#SBATCH --job-name=calculate-scores
+#SBATCH --time=48:00:00
+#SBATCH --output=logs/%j.out
+#SBATCH --error=logs/%j.err
+#SBATCH --mail-type=END
+#SBATCH --mail-user=jalil.nourisa@gmail.com
+#SBATCH --mem=64G
+#SBATCH --cpus-per-task=20
+
+python src/metrics/regression_1/script_all.py
diff --git a/src/methods/single_omics/portia/script.py b/src/methods/single_omics/portia/script.py
index 6bdeeb435..d8cede44c 100644
--- a/src/methods/single_omics/portia/script.py
+++ b/src/methods/single_omics/portia/script.py
@@ -44,7 +44,7 @@ def infer_grn(X, gene_names):
dataset.add(pt.Experiment(exp_id, data))
M_bar = pt.run(dataset, method='no-transform')
- limit = min([10000000, X.size])
+ limit = min([500000, X.size])
ranked_scores = pt.rank_scores(M_bar, gene_names, limit=limit)
sources, targets, weights = zip(*[(gene_a, gene_b, score) for gene_a, gene_b, score in ranked_scores])
diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py
index 6a1da25d4..9886bbc65 100644
--- a/src/metrics/regression_1/main.py
+++ b/src/metrics/regression_1/main.py
@@ -11,6 +11,10 @@
from tqdm import tqdm
from util import verbose_print, process_links, verbose_tqdm
import os
+import warnings
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+
class lightgbm_wrapper:
@@ -54,14 +58,6 @@ def cv_5(genes_n):
def cross_validation(net, prturb_adata, par:dict):
gene_names = prturb_adata.var_names
net = process_net(net.copy(), gene_names)
- net_subset = net.copy()
- # Subset TFs
- if par['tf_n'] == -1:
- degrees = net.abs().sum(axis=0)
- net = net.loc[:, degrees >= degrees.quantile((1 - par['theta']))]
- else:
- degrees = net.abs().sum(axis=0)
- net = net[degrees.nlargest(tf_n).index]
gene_names_grn = net.index.to_numpy()
@@ -105,11 +101,11 @@ def cross_validation(net, prturb_adata, par:dict):
n_estimators=100, min_samples_leaf=2, min_child_samples=1,
feature_fraction=0.05, verbosity=-1
)
- regr = lightgbm_wrapper(params, max_workers=max_workers)
+ regr = lightgbm_wrapper(params, max_workers=par['num_workers'])
elif reg_type=='RF':
params = dict(boosting_type='rf',random_state=32, n_estimators=100,
feature_fraction=0.05, verbosity=-1)
- regr = lightgbm_wrapper(params, max_workers)
+ regr = lightgbm_wrapper(params, max_workers=par['num_workers'])
else:
raise ValueError(f"{reg_type} is not defined.")
@@ -246,29 +242,11 @@ def main(par):
verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3)
- tfs_cases = [-1]
- if par['min_tf']:
- tfs_cases += 30
- layer_results = {} # Store results for this layer
- for exclude_missing_genes in [False, True]: # two settings on target gene
- for tf_n in tfs_cases: # two settings on tfs
- par['exclude_missing_genes'] = exclude_missing_genes
- par['tf_n'] = tf_n
- run_key = f'ex({exclude_missing_genes})_tf({tf_n})'
- verbose_print(par['verbose'], f'Run for metric: {run_key}', 3)
-
- output = regression_1(net, perturbation_data, par=par, max_workers=max_workers)
-
- layer_results[run_key] = [output['mean_score_r2']]
+ results = {}
+ par['exclude_missing_genes'] = False
+ results['S1'] = [regression_1(net, perturbation_data, par=par, max_workers=max_workers)['mean_score_r2']]
+ par['exclude_missing_genes'] = True
+ results['S2'] = [regression_1(net, perturbation_data, par=par, max_workers=max_workers)['mean_score_r2']]
- # Convert results to DataFrame
- df_results = pd.DataFrame(layer_results)
- # if par['min_tf']:
- # if 'ex(True)_tf(140)' not in df_results.columns:
- # df_results['ex(True)_tf(140)'] = df_results['ex(True)_tf(-1)']
- # if 'ex(False)_tf(140)' not in df_results.columns:
- # df_results['ex(False)_tf(140)'] = df_results['ex(False)_tf(-1)']
-
- df_results['Mean'] = df_results.mean(axis=1)
-
+ df_results = pd.DataFrame(results)
return df_results
\ No newline at end of file
diff --git a/src/metrics/regression_1/script_all.py b/src/metrics/regression_1/script_all.py
index 58e1d499b..d25c5faa1 100644
--- a/src/metrics/regression_1/script_all.py
+++ b/src/metrics/regression_1/script_all.py
@@ -3,95 +3,53 @@
import sys
import numpy as np
import os
-from tqdm import tqdm
-from sklearn.preprocessing import StandardScaler
+## VIASH START
par = {
- "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
- 'max_workers': 40,
'reg_type': 'ridge',
+ 'read_dir': "resources/grn_models/d0_hvgs",
+ 'write_dir': "resources/results/scores",
+ 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'],
+ 'layers': ['lognorm', 'pearson', 'scgen_lognorm', 'scgen_pearson'],
+
+ "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
+ "tf_all": "resources/prior/tf_all.csv",
+ "min_tf": False,
+ "max_n_links": 50000,
+ "apply_tf": "true",
'subsample': -2,
- "tf_all": "./resources/prior/tf_all.csv",
- "temp_dir": "output/ridge/noised"
+ 'num_workers': 4,
+ 'verbose': 1,
+ 'binarize': True,
+ 'num_workers': 20,
+ 'consensus': 'resources/prior/consensus-num-regulators.json',
+ 'static_only': True,
+ 'clip_scores': True
}
-grn_models_folder = 'resources/grn_models'
-grn_models_folder = 'resources/supplementary/grn_models_noised'
-
-def create_positive_control(X: np.ndarray, groups: np.ndarray):
- grns = []
- for group in tqdm(np.unique(groups), desc="Processing groups"):
- X_sub = X[groups == group, :]
- X_sub = StandardScaler().fit_transform(X_sub)
- grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
- grns.append(grn)
- return np.mean(grns, axis=0)
-def create_negative_control(gene_names) -> np.ndarray:
- ratio = [.98, .01, 0.01]
- n_tf = 400
- net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio)
- net = pd.DataFrame(net, index=gene_names)
- return net
-
-
-print('Reading input data')
-perturbation_data = ad.read_h5ad(par["perturbation_data"])
-gene_names = perturbation_data.var_names.to_numpy()
-tf_all = np.loadtxt(par['tf_all'], dtype=str)
-groups = perturbation_data.obs.cell_type
-
+# VIASH END
meta = {
- "resources_dir":'./'
+ "resources_dir": 'src/metrics/',
+ "util": 'src/utils'
}
sys.path.append(meta["resources_dir"])
-from main import main
-# layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson']
-layers = ['pearson']
-grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri']
-# controls = ['negative_control', 'positive_control']
-controls = []
-
-os.makedirs(par['temp_dir'], exist_ok=True)
-for grn_model in controls + grn_models :
- par["score"] = f"{par['temp_dir']}/reg1-{grn_model}.csv"
- for ii, layer in enumerate(layers):
- par["layer"] = layer
- if grn_model=='positive_control':
- print('Inferring positive control')
- net = create_positive_control(perturbation_data.layers[par["layer"]], groups)
-
- net = pd.DataFrame(net, index=gene_names, columns=gene_names)
- net = net.loc[:, net.columns.isin(tf_all)]
-
- pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
-
- pivoted_net = pivoted_net.rename(columns={'index': 'target'})
- pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
- par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv"
- print(par['prediction'])
- pivoted_net.to_csv(par['prediction'])
- elif grn_model=='negative_control':
- print('Inferring negative control')
- net = create_negative_control(gene_names)
-
- pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
-
- pivoted_net = pivoted_net.rename(columns={'index': 'target'})
- pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
- par['prediction'] = f"{par['temp_dir']}/negative_control.csv"
- pivoted_net.to_csv(par['prediction'])
+sys.path.append(meta["util"])
+
+os.makedirs(par['write_dir'], exist_ok=True)
+
+for layer in par['layers']:
+ par['layer'] = layer
+ for i, method in enumerate(par['methods']):
+ par['prediction'] = f"{par['read_dir']}/{method}.csv"
+ from regression_1.main import main
+ reg1 = main(par)
+ from regression_2.main import main
+ reg2 = main(par)
+ prediction = pd.concat([reg1, reg2], axis=1)
+ prediction.index = [method]
+ if i==0:
+ df_all = prediction
else:
- par['prediction'] = f"{grn_models_folder}/{grn_model}.csv"
- output = main(par)
- output.index = [layer]
-
- if ii == 0:
- score = output
- else:
- score = pd.concat([score, output], axis=0)
-
- print("Write output to file", flush=True)
- print(grn_model, layer, score)
-
- print("Write output to file", flush=True)
- score.to_csv(par['score'])
-
+ df_all = pd.concat([df_all, prediction])
+ df_all.to_csv(f"{par['write_dir']}/{layer}-{par['reg_type']}.csv")
+ print(df_all)
+
diff --git a/src/metrics/regression_2/consensus/script.py b/src/metrics/regression_2/consensus/script.py
index 6a1c75ec3..075f29a13 100644
--- a/src/metrics/regression_2/consensus/script.py
+++ b/src/metrics/regression_2/consensus/script.py
@@ -11,8 +11,8 @@
## VIASH START
par = {
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
- 'grn_folder': 'resources/grn-benchmark/grn_models',
- 'grns': 'figr.csv,scenicplus.csv',
+ 'grn_folder': 'resources/grn-benchmark/grn_models/d0_hvg',
+ 'grns': 'pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle',
'output': 'resources/grn-benchmark/consensus-num-regulators.json'
}
## VIASH END
@@ -31,7 +31,7 @@
# Load inferred GRNs
grns = []
for filename in par['grns'].split(','):
- filepath = os.path.join(par['grn_folder'], filename)
+ filepath = os.path.join(par['grn_folder'], f'{filename}.csv')
gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
index 352e63cd3..2a09166cc 100644
--- a/src/metrics/regression_2/main.py
+++ b/src/metrics/regression_2/main.py
@@ -11,23 +11,19 @@
from sklearn.model_selection import GroupKFold, LeaveOneGroupOut
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
+from util import verbose_print, process_links, verbose_tqdm
SEED = 0xCAFE
N_POINTS_TO_ESTIMATE_BACKGROUND = 20
-def select_top_links(net, par):
- print("Number of links reduced to ", par['max_n_links'])
- net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
- net = net_sorted.head(par['max_n_links']).reset_index(drop=True)
- return net
def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray:
gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
if 'cell_type' in df.columns:
- print('Taking mean of cell type specific grns')
+ verbose_print(par['verbose'], 'Taking mean of cell type specific grns', 3)
df.drop(columns=['cell_type'], inplace=True)
df = df.groupby(['source', 'target']).mean().reset_index()
@@ -38,8 +34,8 @@ def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.n
j = gene_dict[target]
A[i, j] = float(weight)
if df.shape[0] > par['max_n_links']:
- df = select_top_links(df, par)
- print(df)
+ verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3)
+ df = process_links(df, par)
return A
@@ -265,7 +261,7 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
random.seed(random_state)
lightgbm.LGBMRegressor().set_params(random_state=random_state)
- print('Reading input files', flush=True)
+ verbose_print(par['verbose'], "Reading input files", 3)
# Load perturbation data
perturbation_data = ad.read_h5ad(par['perturbation_data'])
@@ -287,8 +283,7 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
groups = LabelEncoder().fit_transform(perturbation_data.obs.plate_name)
- # Load inferred GRN
- print(f'Loading GRN', flush=True)
+
grn = load_grn(par['prediction'], gene_names, par)
# Load and standardize perturbation data
@@ -313,11 +308,11 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
clip_scores = par['clip_scores']
# Evaluate GRN
- print(f'Compute metrics for layer: {layer}', flush=True)
+ verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3)
# print(f'Dynamic approach:', flush=True)
- print(f'Static approach (theta=0):', flush=True)
+ verbose_print(par['verbose'], f'Static approach (theta=0):', 3)
score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores)
- print(f'Static approach (theta=0.5):', flush=True)
+ verbose_print(par['verbose'], f'Static approach (theta=0.5):', 3)
score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores)
# print(f'Static approach (theta=1):', flush=True)
# score_static_max = static_approach(grn, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'])
@@ -328,7 +323,6 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
'static-theta-0.5': [float(score_static_median)]
# 'static-theta-1.0': [float(score_static_max)],
}
- # print(f'Scores on {layer}: {results}')
# Add dynamic score
if not par['static_only']:
@@ -339,6 +333,5 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
# Convert results to DataFrame
df_results = pd.DataFrame(results)
- df_results['Mean'] = df_results.mean(axis=1)
return df_results
diff --git a/src/metrics/regression_2/script_all.py b/src/metrics/regression_2/script_all.py
index c4aca8656..76173788e 100644
--- a/src/metrics/regression_2/script_all.py
+++ b/src/metrics/regression_2/script_all.py
@@ -3,91 +3,51 @@
import sys
import numpy as np
import os
-from tqdm import tqdm
-from sklearn.preprocessing import StandardScaler
+## VIASH START
par = {
- "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
- 'max_workers': 10,
'reg_type': 'ridge',
- 'subsample': 2,
- "tf_all": "./resources/prior/tf_all.csv",
- "temp_dir": "output",
- 'consensus': 'resources/prior/consensus-num-regulators.json',
-}
-
-def create_positive_control(X: np.ndarray, groups: np.ndarray):
- grns = []
- for group in tqdm(np.unique(groups), desc="Processing groups"):
- X_sub = X[groups == group, :]
- X_sub = StandardScaler().fit_transform(X_sub)
- grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
- grns.append(grn)
- return np.mean(grns, axis=0)
-def create_negative_control(gene_names) -> np.ndarray:
- ratio = [.98, .01, 0.01]
- n_tf = 400
- net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio)
- net = pd.DataFrame(net, index=gene_names)
- return net
+ 'read_dir': "resources/grn_models/d0_hvgs",
+ 'write_dir': "resources/results/d0_hvgs_ridge",
+ 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'],
-print('Reading input data')
-perturbation_data = ad.read_h5ad(par["perturbation_data"])
-gene_names = perturbation_data.var_names.to_numpy()
-tf_all = np.loadtxt(par['tf_all'], dtype=str)
-groups = perturbation_data.obs.cell_type
-
+ "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
+ "tf_all": "resources/prior/tf_all.csv",
+ "min_tf": False,
+ "max_n_links": 50000,
+ "apply_tf": "true",
+ 'layer': 'scgen_pearson',
+ 'subsample': -2,
+ 'num_workers': 4,
+ 'verbose': 1,
+ 'binarize': True,
+ 'num_workers': 20,
+ 'consensus': 'resources/prior/consensus-num-regulators.json',
+ 'static_only': True,
+ 'clip_scores': True
+}
+# VIASH END
meta = {
- "resources_dir":'./'
+ "resources_dir": 'src/metrics/regression_2/',
+ "util": 'src/utils'
}
sys.path.append(meta["resources_dir"])
+sys.path.append(meta["util"])
from main import main
-layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson']
-grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri']
-controls = ['negative_control', 'positive_control']
-
-os.makedirs(par['temp_dir'], exist_ok=True)
-for grn_model in controls + grn_models :
- par["score"] = f"{par['temp_dir']}/reg2-{grn_model}.csv"
- for ii, layer in enumerate(layers):
- par["layer"] = layer
- if grn_model=='positive_control':
- # print('Inferring GRN')
- # net = create_positive_control(perturbation_data.layers[par["layer"]], groups)
-
- # net = pd.DataFrame(net, index=gene_names, columns=gene_names)
- # net = net.loc[:, net.columns.isin(tf_all)]
-
- # pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
-
- # pivoted_net = pivoted_net.rename(columns={'index': 'target'})
- # pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
- par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv"
- # pivoted_net.to_csv(par['prediction'])
- elif grn_model=='negative_control':
- # print('Inferring GRN')
- # net = create_negative_control(gene_names)
-
- # pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
-
- # pivoted_net = pivoted_net.rename(columns={'index': 'target'})
- # pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
- par['prediction'] = f"{par['temp_dir']}/negative_control.csv"
- # pivoted_net.to_csv(par['prediction'])
- else:
- par['prediction'] = f"resources/grn_models/{grn_model}.csv"
- output = main(par)
- output.index = [layer]
- if ii == 0:
- score = output
- else:
- score = pd.concat([score, output], axis=0)
- print("Write output to file", flush=True)
- print(grn_model, layer, score)
+os.makedirs(par['write_dir'], exist_ok=True)
- print("Write output to file", flush=True)
- score.to_csv(par['score'])
+for i, method in enumerate(par['methods']):
+ par['prediction'] = f"{par['read_dir']}/{method}.csv"
+ prediction = main(par)
+ prediction.index = [method]
+ if i==0:
+ df_all = prediction
+ else:
+ df_all = pd.concat([df_all, prediction])
+ df_all.to_csv(f"{par['write_dir']}/reg2.csv")
+ print(df_all)
+
diff --git a/src/robustness_analysis/permute_grn/script.py b/src/robustness_analysis/permute_grn/script.py
index ad3eca2b4..da6664ea8 100644
--- a/src/robustness_analysis/permute_grn/script.py
+++ b/src/robustness_analysis/permute_grn/script.py
@@ -26,12 +26,6 @@
noise = np.random.normal(loc=0, scale=degree * std_dev, size=prediction['weight'].shape)
prediction['weight'] += noise
-elif type == 'links': # shuffle source-target-weight
- print('Permute links')
- num_rows_to_permute = int(len(prediction) * degree)
- permute_indices = np.random.choice(prediction.index, size=num_rows_to_permute, replace=False)
- prediction.loc[permute_indices, 'weight'] = np.random.permutation(prediction.loc[permute_indices, 'weight'].values)
-
elif type == 'net': # shuffle source-target matrix
print('Permute links')
@@ -41,7 +35,7 @@
# Fill NaNs with 0 or a value of your choice
pivot_df.fillna(0, inplace=True)
- # 2. Randomly choose 20% of the matrix to shuffle
+ # 2. Randomly choose degree% of the matrix to shuffle
matrix_flattened = pivot_df.values.flatten()
n_elements = len(matrix_flattened)
n_shuffle = int(n_elements * degree)
diff --git a/src/workflows/run_robustness_analysis/config.novsh.yaml b/src/workflows/run_robustness_analysis/config.vsh.yaml
similarity index 100%
rename from src/workflows/run_robustness_analysis/config.novsh.yaml
rename to src/workflows/run_robustness_analysis/config.vsh.yaml
diff --git a/src/workflows/run_robustness_analysis/main.nf b/src/workflows/run_robustness_analysis/main.nf
index 1b38effed..4fe38b43b 100644
--- a/src/workflows/run_robustness_analysis/main.nf
+++ b/src/workflows/run_robustness_analysis/main.nf
@@ -14,7 +14,7 @@ workflow run_wf {
// construct list of metrics
metrics = [
- // regression_1,
+ regression_1,
regression_2
]
@@ -52,6 +52,7 @@ workflow run_wf {
num_workers: "num_workers",
consensus: "consensus",
tf_all: "tf_all"
+
],
// use 'toState' to publish that component's outputs to the overall state
toState: { id, output, state, comp ->