diff --git a/batch_run.sh b/batch_run.sh index 1b2cc8ecb..7c5544804 100644 --- a/batch_run.sh +++ b/batch_run.sh @@ -13,3 +13,6 @@ # sbatch --job-name=tigress_donor0_hvg scripts/sbatch/single_omics_R.sh tigress tigress + + +# sbatch --job-name=tigress_donor0_hvg scripts/sbatch/calculate_scores.sh \ No newline at end of file diff --git a/runs.ipynb b/runs.ipynb index 5ade52ba0..cd1c593f8 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -16,6 +16,26 @@ "# collectRI.to_csv(f'resources/grn_models/default/collectri.csv')" ] }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "multiomics_atac_d0.h5ad multiomics_rna_d0.h5ad multiomics_rna.rds\n", + "multiomics_atac.h5ad\t multiomics_rna_d0_hvg.h5ad perturbation_data.h5ad\n", + "multiomics_atac.rds\t multiomics_rna.h5ad\n", + "multiomics_rna_0.h5ad\t multiomics_rna_hvg.h5ad\n" + ] + } + ], + "source": [ + "!ls resources/grn-benchmark//" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -56,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -66,6 +86,9 @@ "import numpy as np\n", "import scanpy as sc \n", "from src.exp_analysis.helper import plot_cumulative_density\n", + "import sys \n", + "sys.path.append('../')\n", + "from grn_benchmark.src.commons import surragate_names\n", "def extract_data(data, reg='reg1', dataset_id='scgen_pearson'):\n", " i = 0\n", " for entry in data:\n", @@ -120,32 +143,16 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__MACOSX\t\t\t\t shalek.h5ad\n", - "han.h5ad\t\t\t\t shalek_KDunion.csv\n", - "han_KDunion.csv\t\t\t\t shalek_chipunion.csv\n", - "han_chipunion.csv\t\t\t shalek_chipunion_KDUnion_intersect.csv\n", - "han_chipunion_KDUnion_intersect.csv\t zhao.h5ad\n", - "jackson.h5ad\t\t\t\t zhao_KDunion.csv\n", - "jackson_KDunion.csv\t\t\t zhao_chipunion.csv\n", - "jackson_chipunion.csv\t\t\t zhao_chipunion_KDUnion_intersect.csv\n", - "jackson_chipunion_KDUnion_intersect.csv\n" - ] - } - ], + "outputs": [], "source": [ - "!ls resources_local/mccalla_extended/" + "# !ls resources_local/mccalla_extended/" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -175,281 +182,322 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Scores" + "methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" + "# consensus: run this after updating method list\n", + "# !python src/metrics/regression_2/consensus/script.py #inside the method names and dir\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## GB" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Submitted batch job 7746169\n" + ] + } + ], + "source": [ + "!sbatch scripts/sbatch/calculate_scores.sh #includes both reg1 and 2. #inside the script, set the reg_type, read and write dirs, and methods" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scores" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5S1S2static-theta-0.0static-theta-0.5
collectrinannannannancollectri-0.100238-0.2111820.4893160.514896
negative_controlnannannannannegative_control-0.043795-0.0455610.4590800.505002
positive_control0.4891470.6771550.6554070.574608positive_control0.4891470.6771550.6554070.574608
pearson_corr0.2386640.5146120.5295020.524232pearson_corr0.2386640.5146120.5295020.524232
pearson_causal0.3552560.5787530.7413280.560490pearson_causal0.3552560.5787530.7413280.560490
portia0.1489410.2272480.4512560.518048portia0.1489410.2272480.4512560.518048
ppcor0.0228460.0941070.3966800.509874ppcor0.0228460.0941070.3966800.509874
genie30.3726410.4903570.7540730.576580genie30.3726410.4903570.7540730.576580
grnboost20.3810320.4598600.7818520.609075grnboost20.3810320.4598600.7818520.609075
scenic0.1543990.2065710.6008390.574294scenic0.1475530.2146940.6008390.574294
scglue0.0783090.2388590.4486170.527076scglue0.0783090.2388590.4486170.527076
celloracle0.2168970.3114510.6395560.580147celloracle0.2168970.3114510.6395560.580147
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 33, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "RUN_ID=\"d0_hvgs\" \n", - "df_scores = process_data(RUN_ID, models_all=methods)\n", + "df_scores = pd.read_csv('resources/results/scores/ridge.csv', index_col=0)\n", "df_scores.style.background_gradient()" ] }, @@ -460,26 +508,33 @@ "## Format resourcs used" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### local runs" + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", - "/vol/tmp/users/jnourisa/ipykernel_1653246/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + "/vol/tmp/users/jnourisa/ipykernel_1858346/2555768217.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n" ] }, @@ -596,15 +651,15 @@ "scglue 35.933720 " ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# extract trace for local jobs\n", + "# extract trace_seqerafor local jobs\n", "job_ids = {\n", - " 'portia': 7742241,\n", + " 'portia': 7744548,\n", " 'grnboost2': 7742249,\n", " 'scenic': 7742283,\n", " 'genie3': 7742285,\n", @@ -646,11 +701,28 @@ "df_local" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### sequra runs: multiple runs" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/vol/tmp/users/jnourisa/ipykernel_1858346/2779943256.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " trace = trace.groupby('model').apply(lambda df: df.sort_values(by='duration', ascending=False).iloc[0])[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n", + "/vol/tmp/users/jnourisa/ipykernel_1858346/2779943256.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " trace = trace.groupby('model').apply(lambda df: df.sort_values(by='duration', ascending=False).iloc[0])[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n" + ] + }, { "data": { "text/html": [ @@ -679,15 +751,51 @@ " wchar\n", " duration\n", " \n", + " \n", + " model\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", + " pearson_causal\n", + " 515.0\n", + " 0.9747\n", + " 4.0\n", + " 0.0659\n", + " 0.0014\n", + " 0.064167\n", + " \n", + " \n", + " pearson_corr\n", + " 528.2\n", + " 0.9751\n", + " 4.0\n", + " 0.0659\n", + " 0.0014\n", + " 0.072500\n", + " \n", + " \n", + " positive_control\n", + " 200.1\n", + " 4.9000\n", + " 8.0\n", + " 2.3000\n", + " 0.0018\n", + " 0.075278\n", + " \n", + " \n", " celloracle\n", " 799.6\n", - " 14.9\n", + " 14.9000\n", " 35.4\n", - " 18.0\n", - " 86.1\n", + " 18.0000\n", + " 86.1000\n", " 1.472222\n", " \n", " \n", @@ -695,63 +803,84 @@ "" ], "text/plain": [ - " %cpu peak_rss peak_vmem rchar wchar duration\n", - "celloracle 799.6 14.9 35.4 18.0 86.1 1.472222" + " %cpu peak_rss peak_vmem rchar wchar duration\n", + "model \n", + "pearson_causal 515.0 0.9747 4.0 0.0659 0.0014 0.064167\n", + "pearson_corr 528.2 0.9751 4.0 0.0659 0.0014 0.072500\n", + "positive_control 200.1 4.9000 8.0 2.3000 0.0018 0.075278\n", + "celloracle 799.6 14.9000 35.4 18.0000 86.1000 1.472222" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# sequra runs\n", - "base_dir = 'resources/results/d0_hvgs_res/'\n", - "models = ['celloracle']\n", - "\n", - "def convert_duration_to_hours(duration_str):\n", - " import re\n", - " hours, minutes, seconds = 0, 0, 0\n", - " time_parts = re.findall(r'(\\d+)([hms])', duration_str)\n", - " for value, unit in time_parts:\n", - " if unit == 'h':\n", - " hours = int(value)\n", - " elif unit == 'm':\n", - " minutes = int(value)\n", - " elif unit == 's':\n", - " seconds = int(value)\n", - " return (hours * 3600 + minutes * 60 + seconds)/3600\n", - "def format_gb(gb_str):\n", - " gb_value = float(gb_str.split()[0])\n", - " return gb_value \n", "\n", + "def process_trace(trace):\n", + " trace['model'] = pd.DataFrame(trace.name.str.split(':').to_list())[3] #TODO: number 3 might be different \n", + " trace = trace[trace['model'].isin(models)]\n", + " trace = trace.groupby('model').apply(lambda df: df.sort_values(by='duration', ascending=False).iloc[0])[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n", "\n", - "for i, method in enumerate(models):\n", - " df = pd.read_csv(f'{base_dir}/{method}.txt', sep='\\t')\n", - " df = df[['%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar', 'duration']]\n", - " df = df.drop(1)\n", "\n", + " def convert_duration_to_hours(duration_str):\n", + " import re\n", + " hours, minutes, seconds = 0, 0, 0\n", + " time_parts = re.findall(r'(\\d+)([hms])', duration_str)\n", + " for value, unit in time_parts:\n", + " if unit == 'h':\n", + " hours = int(value)\n", + " elif unit == 'm':\n", + " minutes = int(value)\n", + " elif unit == 's':\n", + " seconds = int(value)\n", + " return (hours * 3600 + minutes * 60 + seconds)/3600\n", + " def format_ram(row):\n", + " value = float(row.split()[0])\n", + " if 'GB' in row:\n", + " value = value\n", + " elif 'MB' in row:\n", + " value = value/1000\n", + " else:\n", + " raise ValueError('Define')\n", + " return value \n", "\n", - " for col in df.columns:\n", + " for col in trace.columns:\n", " if col=='%cpu':\n", - " df[col] = df[col].str.replace('%', '').astype(float)\n", + " trace[col] = trace[col].str.replace('%', '').astype(float)\n", " elif col=='duration':\n", - " df[col] = df[col].apply(convert_duration_to_hours)\n", + " trace[col] = trace[col].apply(convert_duration_to_hours)\n", " else:\n", - " df[col] = df[col].apply(format_gb)\n", - " df = df.sort_values(by='duration', ascending=False).iloc[[0]]\n", - " \n", - " df.index = [method]\n", - " if i==0:\n", - " df_sequera = df \n", - " else:\n", - " df_sequera = pd.concat([df_sequera, df], axis=0)\n", - "df_sequera" + " trace[col] = trace[col].apply(format_ram)\n", + " return trace \n", + "\n", + "models = ['pearson_causal', 'pearson_corr', 'positive_control']\n", + "base_dir = 'resources/results/d0_hvgs_baseline'\n", + "trace = pd.read_csv(f'{base_dir}/trace.txt', sep='\\t')\n", + "\n", + "trace_baselines = process_trace(trace)\n", + "\n", + "models = ['celloracle']\n", + "base_dir = 'resources/results/celloracle_d0_hvgs'\n", + "trace = pd.read_csv(f'{base_dir}/trace.txt', sep='\\t')\n", + "trace_models = process_trace(trace)\n", + "\n", + "\n", + "trace_seqera = pd.concat([trace_baselines, trace_models], axis=0)\n", + "trace_seqera" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### merge local and cluster dfs " ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -781,6 +910,21 @@ " \n", " \n", " \n", + " pearson_causal\n", + " 0.974700\n", + " 0.064167\n", + " \n", + " \n", + " pearson_corr\n", + " 0.975100\n", + " 0.072500\n", + " \n", + " \n", + " positive_control\n", + " 4.900000\n", + " 0.075278\n", + " \n", + " \n", " celloracle\n", " 14.900000\n", " 1.472222\n", @@ -820,17 +964,20 @@ "" ], "text/plain": [ - " Peak memory (GB) Duration (hour)\n", - "celloracle 14.900000 1.472222\n", - "portia 46.943497 0.110556\n", - "grnboost2 3.067471 1.568056\n", - "scenic 30.356461 1.908056\n", - "genie3 13.105103 16.682500\n", - "ppcor 3.909119 0.556667\n", - "scglue 29.917423 4.380278" + " Peak memory (GB) Duration (hour)\n", + "pearson_causal 0.974700 0.064167\n", + "pearson_corr 0.975100 0.072500\n", + "positive_control 4.900000 0.075278\n", + "celloracle 14.900000 1.472222\n", + "portia 46.943497 0.110556\n", + "grnboost2 3.067471 1.568056\n", + "scenic 30.356461 1.908056\n", + "genie3 13.105103 16.682500\n", + "ppcor 3.909119 0.556667\n", + "scglue 29.917423 4.380278" ] }, - "execution_count": 11, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -840,12 +987,12 @@ "map_dict = {'peak_rss':'MaxRSS', 'duration':'Elapsed'}\n", "\n", "df_local = df_local[map_dict.values()]\n", - "df_sequera = df_sequera[map_dict.keys()]\n", + "trace_seqera = trace_seqera[map_dict.keys()]\n", "\n", - "df_sequera.columns = df_sequera.columns.map(map_dict)\n", + "trace_seqera.columns = trace_seqera.columns.map(map_dict)\n", "\n", "\n", - "df_res = pd.concat([df_sequera, df_local], axis=0)\n", + "df_res = pd.concat([trace_seqera, df_local], axis=0)\n", "\n", "df_res.columns = ['Peak memory (GB)', 'Duration (hour)']\n", "\n", @@ -861,7 +1008,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -885,145 +1032,184 @@ " \n", " \n", " \n", + " method_name\n", " ex(False)_tf(-1)\n", " ex(True)_tf(-1)\n", " static-theta-0.0\n", " static-theta-0.5\n", " overall_score\n", + " Peak memory (GB)\n", + " Duration (hour)\n", " \n", " \n", " \n", " \n", - " collectri\n", - " 0.000000\n", + " 0\n", + " collectri\n", " 0.000000\n", " 0.000000\n", + " 0.240505\n", + " 0.095068\n", + " 0.077646\n", " 0.000000\n", " 0.000000\n", " \n", " \n", - " negative_control\n", + " 1\n", + " negative_control\n", " 0.000000\n", " 0.000000\n", + " 0.162005\n", " 0.000000\n", + " 0.032401\n", " 0.000000\n", " 0.000000\n", " \n", " \n", - " positive_control\n", + " 2\n", + " positive_control\n", " 1.000000\n", " 1.000000\n", - " 0.838274\n", - " 0.943410\n", - " 0.945421\n", + " 0.671716\n", + " 0.668815\n", + " 0.860974\n", + " 4.900000\n", + " 0.075278\n", " \n", " \n", - " pearson_corr\n", + " 3\n", + " pearson_corr\n", " 0.487919\n", " 0.759962\n", - " 0.677241\n", - " 0.860702\n", - " 0.696456\n", + " 0.344838\n", + " 0.184772\n", + " 0.453523\n", + " 0.975100\n", + " 0.072500\n", " \n", " \n", - " pearson_causal\n", + " 4\n", + " pearson_causal\n", " 0.726277\n", " 0.854683\n", - " 0.948169\n", - " 0.920231\n", - " 0.862340\n", + " 0.894789\n", + " 0.533161\n", + " 0.774527\n", + " 0.974700\n", + " 0.064167\n", " \n", " \n", - " portia\n", + " 5\n", + " portia\n", " 0.304491\n", " 0.335592\n", - " 0.577163\n", - " 0.850549\n", - " 0.516949\n", + " 0.141693\n", + " 0.125352\n", + " 0.226639\n", + " 46.943497\n", + " 0.110556\n", " \n", " \n", - " ppcor\n", + " 6\n", + " ppcor\n", " 0.046705\n", " 0.138974\n", - " 0.507359\n", - " 0.837129\n", - " 0.382542\n", + " 0.000000\n", + " 0.046814\n", + " 0.050776\n", + " 3.909119\n", + " 0.556667\n", " \n", " \n", - " genie3\n", + " 7\n", + " genie3\n", " 0.761818\n", " 0.724142\n", - " 0.964470\n", - " 0.946649\n", - " 0.849270\n", + " 0.927879\n", + " 0.687766\n", + " 0.798691\n", + " 13.105103\n", + " 16.682500\n", " \n", " \n", - " grnboost2\n", + " 8\n", + " grnboost2\n", " 0.778974\n", " 0.679105\n", " 1.000000\n", " 1.000000\n", - " 0.864520\n", + " 0.891616\n", + " 3.067471\n", + " 1.568056\n", " \n", " \n", - " scenic\n", - " 0.315650\n", + " 9\n", + " scenic\n", + " 0.295812\n", " 0.305056\n", - " 0.768481\n", - " 0.942895\n", - " 0.583021\n", + " 0.530045\n", + " 0.665799\n", + " 0.458534\n", + " 30.356461\n", + " 1.908056\n", " \n", " \n", - " scglue\n", + " 10\n", + " scglue\n", " 0.160093\n", " 0.352738\n", - " 0.573787\n", - " 0.865371\n", - " 0.487997\n", + " 0.134840\n", + " 0.212101\n", + " 0.214294\n", + " 29.917423\n", + " 4.380278\n", " \n", " \n", - " celloracle\n", + " 11\n", + " celloracle\n", " 0.443420\n", " 0.459941\n", - " 0.818001\n", - " 0.952505\n", - " 0.668467\n", + " 0.630565\n", + " 0.722041\n", + " 0.578251\n", + " 14.900000\n", + " 1.472222\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ex(False)_tf(-1) ex(True)_tf(-1) static-theta-0.0 \\\n", - "collectri 0.000000 0.000000 0.000000 \n", - "negative_control 0.000000 0.000000 0.000000 \n", - "positive_control 1.000000 1.000000 0.838274 \n", - "pearson_corr 0.487919 0.759962 0.677241 \n", - "pearson_causal 0.726277 0.854683 0.948169 \n", - "portia 0.304491 0.335592 0.577163 \n", - "ppcor 0.046705 0.138974 0.507359 \n", - "genie3 0.761818 0.724142 0.964470 \n", - "grnboost2 0.778974 0.679105 1.000000 \n", - "scenic 0.315650 0.305056 0.768481 \n", - "scglue 0.160093 0.352738 0.573787 \n", - "celloracle 0.443420 0.459941 0.818001 \n", + " method_name ex(False)_tf(-1) ex(True)_tf(-1) static-theta-0.0 \\\n", + "0 collectri 0.000000 0.000000 0.240505 \n", + "1 negative_control 0.000000 0.000000 0.162005 \n", + "2 positive_control 1.000000 1.000000 0.671716 \n", + "3 pearson_corr 0.487919 0.759962 0.344838 \n", + "4 pearson_causal 0.726277 0.854683 0.894789 \n", + "5 portia 0.304491 0.335592 0.141693 \n", + "6 ppcor 0.046705 0.138974 0.000000 \n", + "7 genie3 0.761818 0.724142 0.927879 \n", + "8 grnboost2 0.778974 0.679105 1.000000 \n", + "9 scenic 0.295812 0.305056 0.530045 \n", + "10 scglue 0.160093 0.352738 0.134840 \n", + "11 celloracle 0.443420 0.459941 0.630565 \n", "\n", - " static-theta-0.5 overall_score \n", - "collectri 0.000000 0.000000 \n", - "negative_control 0.000000 0.000000 \n", - "positive_control 0.943410 0.945421 \n", - "pearson_corr 0.860702 0.696456 \n", - "pearson_causal 0.920231 0.862340 \n", - "portia 0.850549 0.516949 \n", - "ppcor 0.837129 0.382542 \n", - "genie3 0.946649 0.849270 \n", - "grnboost2 1.000000 0.864520 \n", - "scenic 0.942895 0.583021 \n", - "scglue 0.865371 0.487997 \n", - "celloracle 0.952505 0.668467 " + " static-theta-0.5 overall_score Peak memory (GB) Duration (hour) \n", + "0 0.095068 0.077646 0.000000 0.000000 \n", + "1 0.000000 0.032401 0.000000 0.000000 \n", + "2 0.668815 0.860974 4.900000 0.075278 \n", + "3 0.184772 0.453523 0.975100 0.072500 \n", + "4 0.533161 0.774527 0.974700 0.064167 \n", + "5 0.125352 0.226639 46.943497 0.110556 \n", + "6 0.046814 0.050776 3.909119 0.556667 \n", + "7 0.687766 0.798691 13.105103 16.682500 \n", + "8 1.000000 0.891616 3.067471 1.568056 \n", + "9 0.665799 0.458534 30.356461 1.908056 \n", + "10 0.212101 0.214294 29.917423 4.380278 \n", + "11 0.722041 0.578251 14.900000 1.472222 " ] }, - "execution_count": 34, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1036,269 +1222,24 @@ "df_scores['overall_score'] = df_scores.mean(axis=1)\n", "# df_scores['rank'] = df_scores.mean(axis=1).rank(ascending=False).astype(int)\n", "\n", - "df_scores" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Peak memory (GB)Duration (hour)ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5overall_score
collectri0.0000000.0000000.0000000.0000000.0000000.0000000.000000
negative_control0.0000000.0000000.0000000.0000000.0000000.0000000.000000
positive_control0.0000000.0000001.0000001.0000000.8382740.9434100.945421
pearson_corr0.0000000.0000000.4879190.7599620.6772410.8607020.696456
pearson_causal0.0000000.0000000.7262770.8546830.9481690.9202310.862340
portia46.9434970.1105560.3044910.3355920.5771630.8505490.516949
ppcor3.9091190.5566670.0467050.1389740.5073590.8371290.382542
genie313.10510316.6825000.7618180.7241420.9644700.9466490.849270
grnboost23.0674711.5680560.7789740.6791051.0000001.0000000.864520
scenic30.3564611.9080560.3156500.3050560.7684810.9428950.583021
scglue29.9174234.3802780.1600930.3527380.5737870.8653710.487997
celloracle14.9000001.4722220.4434200.4599410.8180010.9525050.668467
\n", - "
" - ], - "text/plain": [ - " Peak memory (GB) Duration (hour) ex(False)_tf(-1) \\\n", - "collectri 0.000000 0.000000 0.000000 \n", - "negative_control 0.000000 0.000000 0.000000 \n", - "positive_control 0.000000 0.000000 1.000000 \n", - "pearson_corr 0.000000 0.000000 0.487919 \n", - "pearson_causal 0.000000 0.000000 0.726277 \n", - "portia 46.943497 0.110556 0.304491 \n", - "ppcor 3.909119 0.556667 0.046705 \n", - "genie3 13.105103 16.682500 0.761818 \n", - "grnboost2 3.067471 1.568056 0.778974 \n", - "scenic 30.356461 1.908056 0.315650 \n", - "scglue 29.917423 4.380278 0.160093 \n", - "celloracle 14.900000 1.472222 0.443420 \n", - "\n", - " ex(True)_tf(-1) static-theta-0.0 static-theta-0.5 \\\n", - "collectri 0.000000 0.000000 0.000000 \n", - "negative_control 0.000000 0.000000 0.000000 \n", - "positive_control 1.000000 0.838274 0.943410 \n", - "pearson_corr 0.759962 0.677241 0.860702 \n", - "pearson_causal 0.854683 0.948169 0.920231 \n", - "portia 0.335592 0.577163 0.850549 \n", - "ppcor 0.138974 0.507359 0.837129 \n", - "genie3 0.724142 0.964470 0.946649 \n", - "grnboost2 0.679105 1.000000 1.000000 \n", - "scenic 0.305056 0.768481 0.942895 \n", - "scglue 0.352738 0.573787 0.865371 \n", - "celloracle 0.459941 0.818001 0.952505 \n", - "\n", - " overall_score \n", - "collectri 0.000000 \n", - "negative_control 0.000000 \n", - "positive_control 0.945421 \n", - "pearson_corr 0.696456 \n", - "pearson_causal 0.862340 \n", - "portia 0.516949 \n", - "ppcor 0.382542 \n", - "genie3 0.849270 \n", - "grnboost2 0.864520 \n", - "scenic 0.583021 \n", - "scglue 0.487997 \n", - "celloracle 0.668467 " - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_combined = pd.concat([df_res, df_scores], axis=1).fillna(0)\n", - "df_combined = df_combined.reindex(methods)\n", - "df_combined" + "df_all = pd.concat([df_scores, df_res], axis=1)\n", + "df_all = df_all.fillna(0)\n", + "df_all.index.name = 'method_name' \n", + "df_all = df_all.reset_index()\n", + "\n", + "df_all \n" ] }, { - "cell_type": "code", - "execution_count": 36, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log\n", - " result = getattr(ufunc, method)(*inputs, **kwargs)\n", - "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log\n", - " result = getattr(ufunc, method)(*inputs, **kwargs)\n" - ] - } - ], "source": [ - "summary_all_file = 'output/summary_d0_hvgs.tsv'\n", - "summary_plot = 'output/summary_d0_hvgs.pdf'\n", - "df_all = df_combined.copy()\n", - "# df_combined.index = df_combined.index.map(surragate_names)\n", - "\n", - "\n", - "df_all.index.name = 'method_name' \n", - "# df_all['mean_cpu_pct_scaled'] = df_all['%CPU']/df_all['%CPU'].max()\n", - "df_all['mean_peak_memory_log_scaled'] = np.log(df_all['Peak memory (GB)'])\n", - "# df_all['mean_peak_memory_str'] = [f\"{int(value/ 1E9)}\" for value in df_all['Peak memory']]\n", - "df_all['mean_peak_memory_str'] = df_all['Peak memory (GB)']\n", - "# df_all[\"mean_disk_read_log_scaled\"] = np.log(df_all['Disk read'])\n", - "# df_all[\"mean_disk_read_str\"] = [f\"{int(value/ 1E9)}\" for value in df_all['Disk read']]\n", - "# df_all[\"mean_disk_write_log_scaled\"] = np.log(df_all['Disk write'])\n", - "# df_all[\"mean_disk_write_str\"] = [f\"{int(value/ 1E9)}\" for value in df_all['Disk write']]\n", - "df_all[\"mean_duration_log_scaled\"] = np.log(df_all['Duration (hour)'])\n", - "# df_all[\"mean_duration_str\"] = [f\"{int(value/ (60*60))}\" for value in df_all['Duration']]\n", - "df_all[\"mean_duration_str\"] = df_all['Duration (hour)']\n", - "\n", - "\n", - "df_all = df_all.reset_index()\n", - "df_all.to_csv(summary_all_file, sep='\\t')\n", - "\n" + "## Summary figure" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -1306,23 +1247,68 @@ "output_type": "stream", "text": [ "── \u001b[1mAttaching packages\u001b[22m ─────────────────────────────────────── tidyverse 1.3.1 ──\n", - "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.3.6 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4\n", - "\u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.1.7 \u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.0.9\n", + "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.5.1 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4\n", + "\u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.2.1 \u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.1.4\n", "\u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.2.0 \u001b[32m✔\u001b[39m \u001b[34mstringr\u001b[39m 1.4.0\n", "\u001b[32m✔\u001b[39m \u001b[34mreadr \u001b[39m 2.1.2 \u001b[32m✔\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n", "── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────── tidyverse_conflicts() ──\n", "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n", "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n", - "\u001b[?25hError in library(funkyheatmap) : \n", - " there is no package called ‘funkyheatmap’\n", - "Execution halted\n", - "\u001b[?25h" + "\u001b[?25h\u001b[?25h\u001b[?25hWarning message:\n", + "\u001b[1m\u001b[22m`thisfile()` was deprecated in rprojroot 2.0.0.\n", + "\u001b[36mℹ\u001b[39m Please use `whereami::thisfile()` instead. \n", + "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[1m\u001b[22mNew names:\n", + "\u001b[36m•\u001b[39m `` -> `...1`\n", + "\u001b[1mRows: \u001b[22m\u001b[34m12\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m13\u001b[39m\n", + "\u001b[36m──\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m────────────────────────────────────────────────────────\u001b[39m\n", + "\u001b[1mDelimiter:\u001b[22m \"\\t\"\n", + "\u001b[31mchr\u001b[39m (1): method_name\n", + "\u001b[32mdbl\u001b[39m (12): ...1, ex(False)_tf(-1), ex(True)_tf(-1), static-theta-0.0, static-...\n", + "\n", + "\u001b[36mℹ\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n", + "\u001b[36mℹ\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n", + "\u001b[?25h\u001b[?25h\u001b[90m# A tibble: 10 × 7\u001b[39m\n", + " id id_color name group geom palette options \n", + " \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \n", + "\u001b[90m 1\u001b[39m method_name \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mName\u001b[90m\"\u001b[39m meth… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", + "\u001b[90m 2\u001b[39m overall_score overall_score \u001b[90m\"\u001b[39mScore\u001b[90m\"\u001b[39m over… bar overall \u001b[90m\u001b[39m\n", + "\u001b[90m 3\u001b[39m ex(False)_tf(-1) ex(False)_tf(-1) \u001b[90m\"\u001b[39mS11\u001b[90m\"\u001b[39m metr… funk… metric… \u001b[90m\u001b[39m\n", + "\u001b[90m 4\u001b[39m ex(True)_tf(-1) ex(True)_tf(-1) \u001b[90m\"\u001b[39mS12\u001b[90m\"\u001b[39m metr… funk… metric… \u001b[90m\u001b[39m\n", + "\u001b[90m 5\u001b[39m static-theta-0.0 static-theta-0.0 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", + "\u001b[90m 6\u001b[39m static-theta-0.5 static-theta-0.5 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", + "\u001b[90m 7\u001b[39m memory_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mPeak mem… reso… rect resour… \u001b[90m\u001b[39m\n", + "\u001b[90m 8\u001b[39m memory_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", + "\u001b[90m 9\u001b[39m duration_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mDuration… reso… rect resour… \u001b[90m\u001b[39m\n", + "\u001b[90m10\u001b[39m duration_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", + "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[36mℹ\u001b[39m Could not find column 'id' in data. Using rownames as 'id'.\n", + "\u001b[36mℹ\u001b[39m Column info did not contain a column called 'legend', generating options based on the 'geom' column.\n", + "\u001b[36mℹ\u001b[39m No row info was provided, assuming all rows in `data` are to be plotted.\n", + "\u001b[36mℹ\u001b[39m Row info did not contain group information, assuming rows are ungrouped.\n", + "\u001b[36mℹ\u001b[39m Legend 1 did not contain color, inferring from the palette.\n", + "\u001b[36mℹ\u001b[39m Legend 2 did not contain color, inferring from the palette.\n", + "\u001b[?25h\u001b[?25h\u001b[?25h" ] } ], "source": [ "\n", - "!Rscript ../grn_benchmark/src/summary_figure.R {summary_all_file} {summary_plot}\n" + "summary_file = \"output/summary_d0_hvg.tsv\"\n", + "summary_figure = \"output/summary_d0_hvg_figure.pdf\"\n", + "\n", + "df_all['memory_log'] = np.log(df_all['Peak memory (GB)']+1)\n", + "df_all['memory_log'] = np.max(df_all['memory_log'])-df_all['memory_log']\n", + "\n", + "\n", + "df_all[\"duration_log\"] = np.log(df_all['Duration (hour)']+1)\n", + "df_all['duration_log'] = np.max(df_all['duration_log'])-df_all['duration_log']\n", + "\n", + "df_all[\"duration_str\"] = df_all['Duration (hour)'].round(1).astype(str)\n", + "df_all['memory_str'] = df_all['Peak memory (GB)'].round(1).astype(str)\n", + "\n", + "\n", + "df_all.to_csv(summary_file, sep='\\t')\n", + "\n", + "!Rscript ../grn_benchmark/src/metrics_figure.R {summary_file} {summary_figure}\n" ] }, { @@ -1334,27 +1320,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ - "if par['metacell']:\n", - " print('metacell')\n", - " def create_meta_cells(df, n_cells=15):\n", - " meta_x = []\n", - " for i in range(0, df.shape[0], n_cells):\n", - " meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values)\n", - " df = pd.DataFrame(meta_x, columns=df.columns)\n", - " return df\n", + "# if par['metacell']:\n", + "# print('metacell')\n", + "# def create_meta_cells(df, n_cells=15):\n", + "# meta_x = []\n", + "# for i in range(0, df.shape[0], n_cells):\n", + "# meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values)\n", + "# df = pd.DataFrame(meta_x, columns=df.columns)\n", + "# return df\n", " \n", - " adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names)\n", - " adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values\n", - " adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values\n", - " df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df))\n", - " X = df.values\n", - " var = pd.DataFrame(index=df.columns)\n", - " obs = df.reset_index()[['cell_type','donor_id']]\n", - " multiomics_rna = ad.AnnData(X=X, obs=obs, var=var)" + "# adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names)\n", + "# adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values\n", + "# adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values\n", + "# df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df))\n", + "# X = df.values\n", + "# var = pd.DataFrame(index=df.columns)\n", + "# obs = df.reset_index()[['cell_type','donor_id']]\n", + "# multiomics_rna = ad.AnnData(X=X, obs=obs, var=var)" ] }, { @@ -1392,7 +1378,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Shuffle net" + "## Permute net" ] }, { diff --git a/scripts/run_benchmark_all.sh b/scripts/run_benchmark_all.sh index 86fb2fbc3..b27f63522 100644 --- a/scripts/run_benchmark_all.sh +++ b/scripts/run_benchmark_all.sh @@ -23,8 +23,8 @@ param_list: metric_ids: $metric_ids method_ids: $method_ids perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad - multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_0_hvgs.h5ad - multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac_0.h5ad + multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_d0_hvg.h5ad + multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac_d0.h5ad reg_type: $reg_type subsample: $subsample num_workers: $num_workers diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh index 878afe48c..b1bed3e56 100644 --- a/scripts/run_grn_evaluation.sh +++ b/scripts/run_grn_evaluation.sh @@ -47,7 +47,7 @@ grn_names=( "collectri" ) - +echo $grn_names # Start writing to the YAML file cat > $param_file << HERE param_list: diff --git a/scripts/run_robust_analys.sh b/scripts/run_robust_analys.sh index f8348c950..285dde911 100644 --- a/scripts/run_robust_analys.sh +++ b/scripts/run_robust_analys.sh @@ -1,18 +1,16 @@ #!/bin/bash -# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" - degrees=(0 10 20 50 100) noise_type="$1" #"net", "weight", "sign" echo $noise_type -RUN_ID="robust_analy_reg2_$1" +RUN_ID="robust_analy_$1" # resources_dir="resources" resources_dir="s3://openproblems-data/resources/grn" publish_dir="${resources_dir}/results/${RUN_ID}" -grn_models_folder="${resources_dir}/grn_models" +grn_models_folder="${resources_dir}/grn_models/d0_hvgs" reg_type=ridge @@ -22,12 +20,18 @@ max_workers=10 param_file="./params/${RUN_ID}.yaml" grn_names=( - "collectri" - "celloracle" - "scenicplus" - "figr" - "granie" - "scglue" + 'collectri' + 'negative_control' + 'positive_control' + 'pearson_corr' + 'pearson_causal' + 'portia' + 'ppcor' + 'genie3' + 'grnboost2' + 'scenic' + 'scglue' + 'celloracle' ) @@ -36,6 +40,7 @@ cat > $param_file << HERE param_list: HERE + append_entry() { cat >> $param_file << HERE - id: ${1}_${2}_${3} diff --git a/scripts/sbatch/calculate_scores.sh b/scripts/sbatch/calculate_scores.sh new file mode 100644 index 000000000..774d9fcab --- /dev/null +++ b/scripts/sbatch/calculate_scores.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --job-name=calculate-scores +#SBATCH --time=48:00:00 +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --mail-type=END +#SBATCH --mail-user=jalil.nourisa@gmail.com +#SBATCH --mem=64G +#SBATCH --cpus-per-task=20 + +python src/metrics/regression_1/script_all.py diff --git a/src/methods/single_omics/portia/script.py b/src/methods/single_omics/portia/script.py index 6bdeeb435..d8cede44c 100644 --- a/src/methods/single_omics/portia/script.py +++ b/src/methods/single_omics/portia/script.py @@ -44,7 +44,7 @@ def infer_grn(X, gene_names): dataset.add(pt.Experiment(exp_id, data)) M_bar = pt.run(dataset, method='no-transform') - limit = min([10000000, X.size]) + limit = min([500000, X.size]) ranked_scores = pt.rank_scores(M_bar, gene_names, limit=limit) sources, targets, weights = zip(*[(gene_a, gene_b, score) for gene_a, gene_b, score in ranked_scores]) diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index 6a1da25d4..9886bbc65 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -11,6 +11,10 @@ from tqdm import tqdm from util import verbose_print, process_links, verbose_tqdm import os +import warnings + +warnings.filterwarnings("ignore", category=FutureWarning) + class lightgbm_wrapper: @@ -54,14 +58,6 @@ def cv_5(genes_n): def cross_validation(net, prturb_adata, par:dict): gene_names = prturb_adata.var_names net = process_net(net.copy(), gene_names) - net_subset = net.copy() - # Subset TFs - if par['tf_n'] == -1: - degrees = net.abs().sum(axis=0) - net = net.loc[:, degrees >= degrees.quantile((1 - par['theta']))] - else: - degrees = net.abs().sum(axis=0) - net = net[degrees.nlargest(tf_n).index] gene_names_grn = net.index.to_numpy() @@ -105,11 +101,11 @@ def cross_validation(net, prturb_adata, par:dict): n_estimators=100, min_samples_leaf=2, min_child_samples=1, feature_fraction=0.05, verbosity=-1 ) - regr = lightgbm_wrapper(params, max_workers=max_workers) + regr = lightgbm_wrapper(params, max_workers=par['num_workers']) elif reg_type=='RF': params = dict(boosting_type='rf',random_state=32, n_estimators=100, feature_fraction=0.05, verbosity=-1) - regr = lightgbm_wrapper(params, max_workers) + regr = lightgbm_wrapper(params, max_workers=par['num_workers']) else: raise ValueError(f"{reg_type} is not defined.") @@ -246,29 +242,11 @@ def main(par): verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3) - tfs_cases = [-1] - if par['min_tf']: - tfs_cases += 30 - layer_results = {} # Store results for this layer - for exclude_missing_genes in [False, True]: # two settings on target gene - for tf_n in tfs_cases: # two settings on tfs - par['exclude_missing_genes'] = exclude_missing_genes - par['tf_n'] = tf_n - run_key = f'ex({exclude_missing_genes})_tf({tf_n})' - verbose_print(par['verbose'], f'Run for metric: {run_key}', 3) - - output = regression_1(net, perturbation_data, par=par, max_workers=max_workers) - - layer_results[run_key] = [output['mean_score_r2']] + results = {} + par['exclude_missing_genes'] = False + results['S1'] = [regression_1(net, perturbation_data, par=par, max_workers=max_workers)['mean_score_r2']] + par['exclude_missing_genes'] = True + results['S2'] = [regression_1(net, perturbation_data, par=par, max_workers=max_workers)['mean_score_r2']] - # Convert results to DataFrame - df_results = pd.DataFrame(layer_results) - # if par['min_tf']: - # if 'ex(True)_tf(140)' not in df_results.columns: - # df_results['ex(True)_tf(140)'] = df_results['ex(True)_tf(-1)'] - # if 'ex(False)_tf(140)' not in df_results.columns: - # df_results['ex(False)_tf(140)'] = df_results['ex(False)_tf(-1)'] - - df_results['Mean'] = df_results.mean(axis=1) - + df_results = pd.DataFrame(results) return df_results \ No newline at end of file diff --git a/src/metrics/regression_1/script_all.py b/src/metrics/regression_1/script_all.py index 58e1d499b..d25c5faa1 100644 --- a/src/metrics/regression_1/script_all.py +++ b/src/metrics/regression_1/script_all.py @@ -3,95 +3,53 @@ import sys import numpy as np import os -from tqdm import tqdm -from sklearn.preprocessing import StandardScaler +## VIASH START par = { - "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", - 'max_workers': 40, 'reg_type': 'ridge', + 'read_dir': "resources/grn_models/d0_hvgs", + 'write_dir': "resources/results/scores", + 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'], + 'layers': ['lognorm', 'pearson', 'scgen_lognorm', 'scgen_pearson'], + + "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", + "tf_all": "resources/prior/tf_all.csv", + "min_tf": False, + "max_n_links": 50000, + "apply_tf": "true", 'subsample': -2, - "tf_all": "./resources/prior/tf_all.csv", - "temp_dir": "output/ridge/noised" + 'num_workers': 4, + 'verbose': 1, + 'binarize': True, + 'num_workers': 20, + 'consensus': 'resources/prior/consensus-num-regulators.json', + 'static_only': True, + 'clip_scores': True } -grn_models_folder = 'resources/grn_models' -grn_models_folder = 'resources/supplementary/grn_models_noised' - -def create_positive_control(X: np.ndarray, groups: np.ndarray): - grns = [] - for group in tqdm(np.unique(groups), desc="Processing groups"): - X_sub = X[groups == group, :] - X_sub = StandardScaler().fit_transform(X_sub) - grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0] - grns.append(grn) - return np.mean(grns, axis=0) -def create_negative_control(gene_names) -> np.ndarray: - ratio = [.98, .01, 0.01] - n_tf = 400 - net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio) - net = pd.DataFrame(net, index=gene_names) - return net - - -print('Reading input data') -perturbation_data = ad.read_h5ad(par["perturbation_data"]) -gene_names = perturbation_data.var_names.to_numpy() -tf_all = np.loadtxt(par['tf_all'], dtype=str) -groups = perturbation_data.obs.cell_type - +# VIASH END meta = { - "resources_dir":'./' + "resources_dir": 'src/metrics/', + "util": 'src/utils' } sys.path.append(meta["resources_dir"]) -from main import main -# layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson'] -layers = ['pearson'] -grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri'] -# controls = ['negative_control', 'positive_control'] -controls = [] - -os.makedirs(par['temp_dir'], exist_ok=True) -for grn_model in controls + grn_models : - par["score"] = f"{par['temp_dir']}/reg1-{grn_model}.csv" - for ii, layer in enumerate(layers): - par["layer"] = layer - if grn_model=='positive_control': - print('Inferring positive control') - net = create_positive_control(perturbation_data.layers[par["layer"]], groups) - - net = pd.DataFrame(net, index=gene_names, columns=gene_names) - net = net.loc[:, net.columns.isin(tf_all)] - - pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight') - - pivoted_net = pivoted_net.rename(columns={'index': 'target'}) - pivoted_net = pivoted_net[pivoted_net['weight'] != 0] - par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv" - print(par['prediction']) - pivoted_net.to_csv(par['prediction']) - elif grn_model=='negative_control': - print('Inferring negative control') - net = create_negative_control(gene_names) - - pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight') - - pivoted_net = pivoted_net.rename(columns={'index': 'target'}) - pivoted_net = pivoted_net[pivoted_net['weight'] != 0] - par['prediction'] = f"{par['temp_dir']}/negative_control.csv" - pivoted_net.to_csv(par['prediction']) +sys.path.append(meta["util"]) + +os.makedirs(par['write_dir'], exist_ok=True) + +for layer in par['layers']: + par['layer'] = layer + for i, method in enumerate(par['methods']): + par['prediction'] = f"{par['read_dir']}/{method}.csv" + from regression_1.main import main + reg1 = main(par) + from regression_2.main import main + reg2 = main(par) + prediction = pd.concat([reg1, reg2], axis=1) + prediction.index = [method] + if i==0: + df_all = prediction else: - par['prediction'] = f"{grn_models_folder}/{grn_model}.csv" - output = main(par) - output.index = [layer] - - if ii == 0: - score = output - else: - score = pd.concat([score, output], axis=0) - - print("Write output to file", flush=True) - print(grn_model, layer, score) - - print("Write output to file", flush=True) - score.to_csv(par['score']) - + df_all = pd.concat([df_all, prediction]) + df_all.to_csv(f"{par['write_dir']}/{layer}-{par['reg_type']}.csv") + print(df_all) + diff --git a/src/metrics/regression_2/consensus/script.py b/src/metrics/regression_2/consensus/script.py index 6a1c75ec3..075f29a13 100644 --- a/src/metrics/regression_2/consensus/script.py +++ b/src/metrics/regression_2/consensus/script.py @@ -11,8 +11,8 @@ ## VIASH START par = { 'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad', - 'grn_folder': 'resources/grn-benchmark/grn_models', - 'grns': 'figr.csv,scenicplus.csv', + 'grn_folder': 'resources/grn-benchmark/grn_models/d0_hvg', + 'grns': 'pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle', 'output': 'resources/grn-benchmark/consensus-num-regulators.json' } ## VIASH END @@ -31,7 +31,7 @@ # Load inferred GRNs grns = [] for filename in par['grns'].split(','): - filepath = os.path.join(par['grn_folder'], filename) + filepath = os.path.join(par['grn_folder'], f'{filename}.csv') gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} A = np.zeros((len(gene_names), len(gene_names)), dtype=float) df = pd.read_csv(filepath, sep=',', header='infer', index_col=0) diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py index 352e63cd3..2a09166cc 100644 --- a/src/metrics/regression_2/main.py +++ b/src/metrics/regression_2/main.py @@ -11,23 +11,19 @@ from sklearn.model_selection import GroupKFold, LeaveOneGroupOut from sklearn.linear_model import Ridge from sklearn.metrics import r2_score, mean_squared_error +from util import verbose_print, process_links, verbose_tqdm SEED = 0xCAFE N_POINTS_TO_ESTIMATE_BACKGROUND = 20 -def select_top_links(net, par): - print("Number of links reduced to ", par['max_n_links']) - net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) - net = net_sorted.head(par['max_n_links']).reset_index(drop=True) - return net def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray: gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} A = np.zeros((len(gene_names), len(gene_names)), dtype=float) df = pd.read_csv(filepath, sep=',', header='infer', index_col=0) if 'cell_type' in df.columns: - print('Taking mean of cell type specific grns') + verbose_print(par['verbose'], 'Taking mean of cell type specific grns', 3) df.drop(columns=['cell_type'], inplace=True) df = df.groupby(['source', 'target']).mean().reset_index() @@ -38,8 +34,8 @@ def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.n j = gene_dict[target] A[i, j] = float(weight) if df.shape[0] > par['max_n_links']: - df = select_top_links(df, par) - print(df) + verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3) + df = process_links(df, par) return A @@ -265,7 +261,7 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: random.seed(random_state) lightgbm.LGBMRegressor().set_params(random_state=random_state) - print('Reading input files', flush=True) + verbose_print(par['verbose'], "Reading input files", 3) # Load perturbation data perturbation_data = ad.read_h5ad(par['perturbation_data']) @@ -287,8 +283,7 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: groups = LabelEncoder().fit_transform(perturbation_data.obs.plate_name) - # Load inferred GRN - print(f'Loading GRN', flush=True) + grn = load_grn(par['prediction'], gene_names, par) # Load and standardize perturbation data @@ -313,11 +308,11 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: clip_scores = par['clip_scores'] # Evaluate GRN - print(f'Compute metrics for layer: {layer}', flush=True) + verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3) # print(f'Dynamic approach:', flush=True) - print(f'Static approach (theta=0):', flush=True) + verbose_print(par['verbose'], f'Static approach (theta=0):', 3) score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores) - print(f'Static approach (theta=0.5):', flush=True) + verbose_print(par['verbose'], f'Static approach (theta=0.5):', 3) score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores) # print(f'Static approach (theta=1):', flush=True) # score_static_max = static_approach(grn, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers']) @@ -328,7 +323,6 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: 'static-theta-0.5': [float(score_static_median)] # 'static-theta-1.0': [float(score_static_max)], } - # print(f'Scores on {layer}: {results}') # Add dynamic score if not par['static_only']: @@ -339,6 +333,5 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: # Convert results to DataFrame df_results = pd.DataFrame(results) - df_results['Mean'] = df_results.mean(axis=1) return df_results diff --git a/src/metrics/regression_2/script_all.py b/src/metrics/regression_2/script_all.py index c4aca8656..76173788e 100644 --- a/src/metrics/regression_2/script_all.py +++ b/src/metrics/regression_2/script_all.py @@ -3,91 +3,51 @@ import sys import numpy as np import os -from tqdm import tqdm -from sklearn.preprocessing import StandardScaler +## VIASH START par = { - "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", - 'max_workers': 10, 'reg_type': 'ridge', - 'subsample': 2, - "tf_all": "./resources/prior/tf_all.csv", - "temp_dir": "output", - 'consensus': 'resources/prior/consensus-num-regulators.json', -} - -def create_positive_control(X: np.ndarray, groups: np.ndarray): - grns = [] - for group in tqdm(np.unique(groups), desc="Processing groups"): - X_sub = X[groups == group, :] - X_sub = StandardScaler().fit_transform(X_sub) - grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0] - grns.append(grn) - return np.mean(grns, axis=0) -def create_negative_control(gene_names) -> np.ndarray: - ratio = [.98, .01, 0.01] - n_tf = 400 - net = np.random.choice([0, -1, 1], size=((len(gene_names), n_tf)),p=ratio) - net = pd.DataFrame(net, index=gene_names) - return net + 'read_dir': "resources/grn_models/d0_hvgs", + 'write_dir': "resources/results/d0_hvgs_ridge", + 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'], -print('Reading input data') -perturbation_data = ad.read_h5ad(par["perturbation_data"]) -gene_names = perturbation_data.var_names.to_numpy() -tf_all = np.loadtxt(par['tf_all'], dtype=str) -groups = perturbation_data.obs.cell_type - + "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", + "tf_all": "resources/prior/tf_all.csv", + "min_tf": False, + "max_n_links": 50000, + "apply_tf": "true", + 'layer': 'scgen_pearson', + 'subsample': -2, + 'num_workers': 4, + 'verbose': 1, + 'binarize': True, + 'num_workers': 20, + 'consensus': 'resources/prior/consensus-num-regulators.json', + 'static_only': True, + 'clip_scores': True +} +# VIASH END meta = { - "resources_dir":'./' + "resources_dir": 'src/metrics/regression_2/', + "util": 'src/utils' } sys.path.append(meta["resources_dir"]) +sys.path.append(meta["util"]) from main import main -layers = ['pearson', 'lognorm', 'scgen_pearson', 'scgen_lognorm', 'seurat_lognorm', 'seurat_pearson'] -grn_models = ['scenicplus', 'celloracle', 'figr', 'granie', 'scglue', 'collectri'] -controls = ['negative_control', 'positive_control'] - -os.makedirs(par['temp_dir'], exist_ok=True) -for grn_model in controls + grn_models : - par["score"] = f"{par['temp_dir']}/reg2-{grn_model}.csv" - for ii, layer in enumerate(layers): - par["layer"] = layer - if grn_model=='positive_control': - # print('Inferring GRN') - # net = create_positive_control(perturbation_data.layers[par["layer"]], groups) - - # net = pd.DataFrame(net, index=gene_names, columns=gene_names) - # net = net.loc[:, net.columns.isin(tf_all)] - - # pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight') - - # pivoted_net = pivoted_net.rename(columns={'index': 'target'}) - # pivoted_net = pivoted_net[pivoted_net['weight'] != 0] - par['prediction'] = f"{par['temp_dir']}/{layer}_positive_control.csv" - # pivoted_net.to_csv(par['prediction']) - elif grn_model=='negative_control': - # print('Inferring GRN') - # net = create_negative_control(gene_names) - - # pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight') - - # pivoted_net = pivoted_net.rename(columns={'index': 'target'}) - # pivoted_net = pivoted_net[pivoted_net['weight'] != 0] - par['prediction'] = f"{par['temp_dir']}/negative_control.csv" - # pivoted_net.to_csv(par['prediction']) - else: - par['prediction'] = f"resources/grn_models/{grn_model}.csv" - output = main(par) - output.index = [layer] - if ii == 0: - score = output - else: - score = pd.concat([score, output], axis=0) - print("Write output to file", flush=True) - print(grn_model, layer, score) +os.makedirs(par['write_dir'], exist_ok=True) - print("Write output to file", flush=True) - score.to_csv(par['score']) +for i, method in enumerate(par['methods']): + par['prediction'] = f"{par['read_dir']}/{method}.csv" + prediction = main(par) + prediction.index = [method] + if i==0: + df_all = prediction + else: + df_all = pd.concat([df_all, prediction]) + df_all.to_csv(f"{par['write_dir']}/reg2.csv") + print(df_all) + diff --git a/src/robustness_analysis/permute_grn/script.py b/src/robustness_analysis/permute_grn/script.py index ad3eca2b4..da6664ea8 100644 --- a/src/robustness_analysis/permute_grn/script.py +++ b/src/robustness_analysis/permute_grn/script.py @@ -26,12 +26,6 @@ noise = np.random.normal(loc=0, scale=degree * std_dev, size=prediction['weight'].shape) prediction['weight'] += noise -elif type == 'links': # shuffle source-target-weight - print('Permute links') - num_rows_to_permute = int(len(prediction) * degree) - permute_indices = np.random.choice(prediction.index, size=num_rows_to_permute, replace=False) - prediction.loc[permute_indices, 'weight'] = np.random.permutation(prediction.loc[permute_indices, 'weight'].values) - elif type == 'net': # shuffle source-target matrix print('Permute links') @@ -41,7 +35,7 @@ # Fill NaNs with 0 or a value of your choice pivot_df.fillna(0, inplace=True) - # 2. Randomly choose 20% of the matrix to shuffle + # 2. Randomly choose degree% of the matrix to shuffle matrix_flattened = pivot_df.values.flatten() n_elements = len(matrix_flattened) n_shuffle = int(n_elements * degree) diff --git a/src/workflows/run_robustness_analysis/config.novsh.yaml b/src/workflows/run_robustness_analysis/config.vsh.yaml similarity index 100% rename from src/workflows/run_robustness_analysis/config.novsh.yaml rename to src/workflows/run_robustness_analysis/config.vsh.yaml diff --git a/src/workflows/run_robustness_analysis/main.nf b/src/workflows/run_robustness_analysis/main.nf index 1b38effed..4fe38b43b 100644 --- a/src/workflows/run_robustness_analysis/main.nf +++ b/src/workflows/run_robustness_analysis/main.nf @@ -14,7 +14,7 @@ workflow run_wf { // construct list of metrics metrics = [ - // regression_1, + regression_1, regression_2 ] @@ -52,6 +52,7 @@ workflow run_wf { num_workers: "num_workers", consensus: "consensus", tf_all: "tf_all" + ], // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp ->