diff --git a/runs.ipynb b/runs.ipynb index 9ed0558e8..43b653e7b 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -1,8 +1,32 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Commands" + ] + }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed 257.3 MiB/746.3 MiB (10.5 MiB/s) with 3 file(s) remaining \r" + ] + } + ], + "source": [ + "!aws s3 sync resources/grn-benchmark s3://openproblems-data/resources/grn/grn-benchmark " + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -14,6 +38,37 @@ "grn_models = ['collectri','granie', 'figr', 'celloracle', 'scglue', 'scenicplus']" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# To be classified " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import anndata as ad \n", + "import numpy as np \n", + "adata_rna = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n", + "X = adata_rna.X.todense()\n", + "\n", + "if True:\n", + " print('qc')\n", + " print('Data shape before QC: ', adata_rna.shape)\n", + " # Remove genes with >=90% of zeros\n", + " mask_gene = (np.mean(X == 0, axis=0) >= 0.9)\n", + " adata_rna = adata_rna[:, ~mask_gene]\n", + " # Remove samples with >=90% of zeros\n", + " mask_cells = (np.mean(X == 0, axis=1) >= 0.9)\n", + " adata_rna = adata_rna[~mask_cells, :]\n", + " print('Data shape after QC: ', adata_rna.shape)\n", + "adata_rna.write('resources/grn-benchmark/multiomics_rna_qc.h5ad')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -3079,6 +3134,501 @@ "df_all['Rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", "df_all.style.background_gradient()" ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "df = pd.read_csv('output/scenic_default/expr_mat_adjacencies.tsv', sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "qc\n", + "Data shape before QC: (25551, 22787)\n", + "Data shape after QC: (4632, 5439)\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "atac-emb.h5ad\t\t\t peaks.bed\n", + "atac.h5ad\t\t\t prediction.csv\n", + "ctx_annotation.tsv\t\t pruned_grn.csv\n", + "draft_grn.csv\t\t\t rna-emb.h5ad\n", + "gene2peak.links\t\t\t rna.h5ad\n", + "glue\t\t\t\t rna.loom\n", + "glue.dill\t\t\t supp.genes_vs_tracks.rankings.feather\n", + "glue.genes_vs_tracks.rankings.feather tfs.txt\n", + "guidance.graphml.gz\n" + ] + } + ], + "source": [ + "!ls output/scglue/" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0sourcetargetweight
00IRF9RSAD20.729921
11IRF9SAMD9L0.812882
22IRF9HERC61.465909
33IRF9XRCC20.437663
44IRF9EXOC3L10.875641
...............
593593SP2PDE9A0.929089
594594SP2ANKRD13B2.120112
595595SP2IQANK10.639963
596596SP2FAM120A0.321937
597597SP2GCC2-AS11.377123
\n", + "

598 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 source target weight\n", + "0 0 IRF9 RSAD2 0.729921\n", + "1 1 IRF9 SAMD9L 0.812882\n", + "2 2 IRF9 HERC6 1.465909\n", + "3 3 IRF9 XRCC2 0.437663\n", + "4 4 IRF9 EXOC3L1 0.875641\n", + ".. ... ... ... ...\n", + "593 593 SP2 PDE9A 0.929089\n", + "594 594 SP2 ANKRD13B 2.120112\n", + "595 595 SP2 IQANK1 0.639963\n", + "596 596 SP2 FAM120A 0.321937\n", + "597 597 SP2 GCC2-AS1 1.377123\n", + "\n", + "[598 rows x 4 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('output/scglue/prediction.csv')\n", + "df # default" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0sourcetargetweight
00BCL6FBP10.544815
11BCL6FAM157C1.530639
22BCL6OLIG10.508618
33BCL6BIRC50.915793
44BCL6TMED50.523122
...............
56865686ZNF816FLII0.160327
56875687ZNF816KNOP10.167249
56885688ZNF816SCO10.184201
56895689ZNF816SH2D4B0.072754
56905690ZNF816SLC13A30.313595
\n", + "

5691 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 source target weight\n", + "0 0 BCL6 FBP1 0.544815\n", + "1 1 BCL6 FAM157C 1.530639\n", + "2 2 BCL6 OLIG1 0.508618\n", + "3 3 BCL6 BIRC5 0.915793\n", + "4 4 BCL6 TMED5 0.523122\n", + "... ... ... ... ...\n", + "5686 5686 ZNF816 FLII 0.160327\n", + "5687 5687 ZNF816 KNOP1 0.167249\n", + "5688 5688 ZNF816 SCO1 0.184201\n", + "5689 5689 ZNF816 SH2D4B 0.072754\n", + "5690 5690 ZNF816 SLC13A3 0.313595\n", + "\n", + "[5691 rows x 4 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df_relaxed = pd.read_csv('output/scglue/prediction.csv')\n", + "df_relaxed " + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0sourcetargetweight
00ACAA1ARHGEF21.038998
11ACAA1TUBA1A1.779427
22ACAA1HMBOX14.849021
33ACAA1BAX1.048583
44ACAA1SAMHD10.571366
...............
1681616816ZXDCPRKAA10.663177
1681716817ZXDCARMC80.551332
1681816818ZXDCANKRD261.600681
1681916819ZXDCYTHDF20.550480
1682016820ZXDCHTT0.626333
\n", + "

16821 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 source target weight\n", + "0 0 ACAA1 ARHGEF2 1.038998\n", + "1 1 ACAA1 TUBA1A 1.779427\n", + "2 2 ACAA1 HMBOX1 4.849021\n", + "3 3 ACAA1 BAX 1.048583\n", + "4 4 ACAA1 SAMHD1 0.571366\n", + "... ... ... ... ...\n", + "16816 16816 ZXDC PRKAA1 0.663177\n", + "16817 16817 ZXDC ARMC8 0.551332\n", + "16818 16818 ZXDC ANKRD26 1.600681\n", + "16819 16819 ZXDC YTHDF2 0.550480\n", + "16820 16820 ZXDC HTT 0.626333\n", + "\n", + "[16821 rows x 4 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_scenic = pd.read_csv('output/scenic/scenic.csv')\n", + "df_scenic " + ] } ], "metadata": { diff --git a/batch.sh b/scripts/batch_scenic.sh similarity index 81% rename from batch.sh rename to scripts/batch_scenic.sh index 4861ee8fb..e8da2a0ec 100644 --- a/batch.sh +++ b/scripts/batch_scenic.sh @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --time=12:00:00 -#SBATCH --job-name=scenic_donor_0 +#SBATCH --job-name=scenic #SBATCH --output=logs/%j.out #SBATCH --error=logs/%j.err #SBATCH --mail-type=END #SBATCH --mail-user=jalil.nourisa@gmail.com -#SBATCH --cpus-per-task=10 +#SBATCH --cpus-per-task=20 #SBATCH --mem=64G singularity exec ../../images/scenic python src/methods/single_omics/scenic/script.py diff --git a/scripts/batch_scglue.sh b/scripts/batch_scglue.sh new file mode 100644 index 000000000..147ad7728 --- /dev/null +++ b/scripts/batch_scglue.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --time=12:00:00 +#SBATCH --job-name=scglue_donor_0 +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --mail-type=END +#SBATCH --mail-user=jalil.nourisa@gmail.com +#SBATCH --cpus-per-task=20 +#SBATCH --mem=64G + +singularity exec ../../images/scglue python src/methods/multi_omics/scglue/script.py diff --git a/src/exp_analysis/config.vsh.yaml b/src/exp_analysis/config.vsh.yaml index 5b3336052..4e50b6a2a 100644 --- a/src/exp_analysis/config.vsh.yaml +++ b/src/exp_analysis/config.vsh.yaml @@ -47,6 +47,12 @@ functionality: required: false direction: output default: output/stats.json + - name: --reg_weight_distribution + type: file + required: false + direction: output + default: output/reg_weight_distribution.png + resources: - type: python_script diff --git a/src/exp_analysis/helper.py b/src/exp_analysis/helper.py index 975b8bb5b..68e5f0809 100644 --- a/src/exp_analysis/helper.py +++ b/src/exp_analysis/helper.py @@ -11,7 +11,6 @@ def degree_centrality(net, source='source', target='target', normalize=False): total_targets = net[target].nunique() counts = counts/total_targets return counts - def plot_cumulative_density(data, title='', ax=None, s=1, **kwdgs): # Step 1: Sort the data sorted_data = np.sort(data) @@ -25,11 +24,14 @@ def plot_cumulative_density(data, title='', ax=None, s=1, **kwdgs): else: fig = None ax.step(sorted_data, cdf, where='post', label=title, **kwdgs) - ax.set_xlabel('Data') + ax.set_xlabel('Weight') ax.set_ylabel('Cumulative Density') ax.set_title(title) - # ax.grid(True) + ax.grid(True) return fig, ax + + + class Connectivity: def __init__(self, net, **kwargs): self.out_deg = degree_centrality(net, source='source', target='target', **kwargs) diff --git a/src/exp_analysis/script.py b/src/exp_analysis/script.py index a9b810c4f..c4f6b5522 100644 --- a/src/exp_analysis/script.py +++ b/src/exp_analysis/script.py @@ -5,23 +5,18 @@ ## VIASH START par = { - "perturbation_data": "resources/grn-benchmark/pertubation_data.h5ad", - "prediction": "resources/grn-benchmark/grn_models/figr.csv", + 
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", + "prediction": "output/scenic_default/scenic.csv", # "peak_gene_net": "resources/grn-benchmark/peak_gene_models/figr.csv", - "annot_peak_database": "resources/grn-benchmark/supp/annot_peak_database.csv", - "annot_gene_database": "resources/grn-benchmark/supp/annot_gene_database.csv", - "hvgs": "resources/grn-benchmark/supp/hvgs.txt" - + "annot_peak_database": "resources/supplementary/annot_peak_database.csv", + "annot_gene_database": "resources/supplementary/annot_gene_database.csv", + "stats": "output/stats.json", + "reg_weight_distribution": "output/reg_weight_distribution.png", + "tf_gene_indegee_fig": "output/tf_gene_indegee_fig.png", + "tf_gene_outdegee_fig": "output/tf_gene_outdegee_fig.png" } ## VIASH END -# meta = { -# "resources_dir":'resources' -# } -sys.path.append(meta["resources_dir"]) -from helper import Explanatory_analysis - -print('Reading input files', flush=True) perturbation_data = ad.read_h5ad(par["perturbation_data"]) prediction = pd.read_csv(par["prediction"]) @@ -29,6 +24,17 @@ # annot_peak_database = pd.read_csv(par["annot_peak_database"]) # hvgs = pd.read_csv(par["hvgs"]) +meta = { + "resources_dir":'src/exp_analysis/' +} +sys.path.append(meta["resources_dir"]) +from helper import Explanatory_analysis, plot_cumulative_density + +print('Plotting CMD of weight: ',par['reg_weight_distribution'], flush=True) +fig, ax = plot_cumulative_density(prediction.weight, title='CMD of reg. weight') +fig.savefig(par['reg_weight_distribution']) + +print('Reading input files', flush=True) # peak_gene_net['source'] = peak_gene_net['peak'] info_obj = Explanatory_analysis(net=prediction) print("Calculate basic stats") diff --git a/src/exp_analysis/test.sh b/src/exp_analysis/test.sh index 004b91d18..3e97b0053 100644 --- a/src/exp_analysis/test.sh +++ b/src/exp_analysis/test.sh @@ -1,4 +1,4 @@ viash run src/exp_analysis/config.vsh.yaml -- \ --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \ - --prediction resources/grn_models/baselines/positive_control.csv \ + --prediction output/scenic/scenic.csv \ diff --git a/src/methods/multi_omics/scglue/main.py b/src/methods/multi_omics/scglue/main.py index b71a208e2..ad3ba5f19 100644 --- a/src/methods/multi_omics/scglue/main.py +++ b/src/methods/multi_omics/scglue/main.py @@ -109,7 +109,7 @@ def training(par): nx.write_graphml(guidance, f"{par['temp_dir']}/guidance.graphml.gz") -def cis_inference(par): +def run_grn(par): ''' Infers gene2peak connections ''' rna = ad.read_h5ad(f"{par['temp_dir']}/rna-emb.h5ad") @@ -155,13 +155,12 @@ def cis_inference(par): np.savetxt(f"{par['temp_dir']}/tfs.txt", tfs, fmt="%s") # Construct the command - command = ( - f"pyscenic grn {par['temp_dir']}/rna.loom {par['temp_dir']}/tfs.txt " - f"-o {par['temp_dir']}/draft_grn.csv --seed 0 --num_workers {par['num_workers']} " - "--cell_id_attribute obs_id --gene_attribute name" - ) - - result = subprocess.run(command, shell=True, capture_output=True, text=True) + command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom", + f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv", + '--seed', '0', '--num_workers', f"{par['num_workers']}", + '--cell_id_attribute', 'obs_id', '--gene_attribute', 'name'] + print('Run grn') + result = subprocess.run(command, check=True) print("Output:") print(result.stdout) @@ -193,7 +192,7 @@ def cis_inference(par): n_samples=0 ) - ### Prune coexpression network using cis-regulatory ranking + ### Prepare data for pruning 
gene2tf_rank_glue.columns = gene2tf_rank_glue.columns + "_glue" gene2tf_rank_supp.columns = gene2tf_rank_supp.columns + "_supp" @@ -215,19 +214,29 @@ def cis_inference(par): orthologous_identity=1.0, description="placeholder" ).to_csv(f"{par['temp_dir']}/ctx_annotation.tsv", sep="\t", index=False) - +def prune_grn(par): # Construct the command #TODO: be sure that obs_id is in obs and name is in var print("Run pscenic ctx", flush=True) - command = ( - f" pyscenic ctx {par['temp_dir']}/draft_grn.csv {par['temp_dir']}/glue.genes_vs_tracks.rankings.feather " - f" {par['temp_dir']}/supp.genes_vs_tracks.rankings.feather --annotations_fname {par['temp_dir']}/ctx_annotation.tsv " - f" --expression_mtx_fname {par['temp_dir']}/rna.loom --output {par['temp_dir']}/pruned_grn.csv " - f" --rank_threshold 500 --min_genes 1 --num_workers {par['num_workers']} " - " --cell_id_attribute obs_id --gene_attribute name" - ) + command = [ + "pyscenic", "ctx", + f"{par['temp_dir']}/draft_grn.csv", + f"{par['temp_dir']}/glue.genes_vs_tracks.rankings.feather", + f"{par['temp_dir']}/supp.genes_vs_tracks.rankings.feather", + "--annotations_fname", f"{par['temp_dir']}/ctx_annotation.tsv", + "--expression_mtx_fname", f"{par['temp_dir']}/rna.loom", + "--output", f"{par['temp_dir']}/pruned_grn.csv", + "--rank_threshold", "10000", + "--auc_threshold", "0.1", + "--nes_threshold", "2", + "--mask_dropouts", + "--min_genes", "1", + "--num_workers", f"{par['num_workers']}", + "--cell_id_attribute", "obs_id", + "--gene_attribute", "name" + ] - result = subprocess.run(command, shell=True, capture_output=True, text=True) + result = subprocess.run(command, check=True) print("Output:") print(result.stdout) @@ -262,15 +271,16 @@ def main(par): print("Number of GPUs:", torch.cuda.device_count()) os.makedirs(par['temp_dir'], exist_ok=True) - print('Reading input files', flush=True) - rna = ad.read_h5ad(par['multiomics_rna']) - atac = ad.read_h5ad(par['multiomics_atac']) - - print('Preprocess data', flush=True) - preprocess(rna, atac, par) - print('Train a model', flush=True) - training(par) - cis_inference(par) + # print('Reading input files', flush=True) + # rna = ad.read_h5ad(par['multiomics_rna']) + # atac = ad.read_h5ad(par['multiomics_atac']) + + # print('Preprocess data', flush=True) + # preprocess(rna, atac, par) + # print('Train a model', flush=True) + # training(par) + # run_grn(par) + prune_grn(par) print('Curate predictions', flush=True) pruned_grn = pd.read_csv( f"{par['temp_dir']}/pruned_grn.csv", header=None, skiprows=3, diff --git a/src/methods/multi_omics/scglue/script.py b/src/methods/multi_omics/scglue/script.py index 25728e583..47a11f38c 100644 --- a/src/methods/multi_omics/scglue/script.py +++ b/src/methods/multi_omics/scglue/script.py @@ -5,17 +5,17 @@ ## VIASH START par = { - "multiomics_rna": "resources_test/grn-benchmark/multiomics_rna.h5ad", - "multiomics_atac": "resources_test/grn-benchmark/multiomics_atac.h5ad", + "multiomics_rna": "resources/grn-benchmark/multiomics_rna_0.h5ad", + "multiomics_atac": "resources/grn-benchmark/multiomics_atac_0.h5ad", "motif_file": "resources/supplementary/JASPAR2022-hg38.bed.gz", "annotation_file": "resources/supplementary/gencode.v45.annotation.gtf.gz", "temp_dir": 'output/scglue/', - "num_workers": 10, - "prediction": "output/prediction.csv", + "num_workers": 20, + "prediction": "output/scglue/prediction.csv", } ## VIASH END meta = { - "resources_dir": 'resources' + "resources_dir": 'src/methods/multi_omics/scglue/' } diff --git a/src/methods/single_omics/ennet/script.R 
b/src/methods/single_omics/ennet/script.R index 87e787795..64e5c076a 100644 --- a/src/methods/single_omics/ennet/script.R +++ b/src/methods/single_omics/ennet/script.R @@ -18,16 +18,16 @@ ad <- anndata::read_h5ad(par$multiomics_rna) X <- as.matrix(ad$X) gene_names <- colnames(ad) -# Remove genes with > 90% of zeros -zero_proportion <- colMeans(X == 0) -mask <- (zero_proportion <= 0.9) -X <- X[, mask] -gene_names <- gene_names[mask] - -# Remove samples with > 90% of zeros -zero_proportion <- rowMeans(X == 0) -mask <- (zero_proportion <= 0.9) -X <- X[mask,] +# # Remove genes with > 90% of zeros +# zero_proportion <- colMeans(X == 0) +# mask <- (zero_proportion <= 0.9) +# X <- X[, mask] +# gene_names <- gene_names[mask] + +# # Remove samples with > 90% of zeros +# zero_proportion <- rowMeans(X == 0) +# mask <- (zero_proportion <= 0.9) +# X <- X[mask,] # Load list of putative TFs dat <- read.csv(par$tf_all, header = FALSE) diff --git a/src/methods/single_omics/genie3/script.py b/src/methods/single_omics/genie3/script.py index a3ab20ac2..9e632553e 100644 --- a/src/methods/single_omics/genie3/script.py +++ b/src/methods/single_omics/genie3/script.py @@ -22,14 +22,15 @@ gene_names = adata_rna.var.gene_ids.index.to_numpy() X = adata_rna.X.toarray() -# Remove genes with >=90% of zeros -mask = (np.mean(X == 0, axis=0) >= 0.9) -X = X[:, ~mask] -gene_names = gene_names[~mask] - -# Remove samples with >=90% of zeros -mask = (np.mean(X == 0, axis=1) >= 0.9) -adata_rna = X[~mask, :] +if False: + # Remove genes with >=90% of zeros + mask = (np.mean(X == 0, axis=0) >= 0.9) + X = X[:, ~mask] + gene_names = gene_names[~mask] + + # Remove samples with >=90% of zeros + mask = (np.mean(X == 0, axis=1) >= 0.9) + adata_rna = X[~mask, :] # Load list of putative TFs df = pd.read_csv(par["tf_all"], header=None, names=['gene_name']) diff --git a/src/methods/single_omics/grnboost2/script.py b/src/methods/single_omics/grnboost2/script.py index 510d273fb..cf983a9e2 100644 --- a/src/methods/single_omics/grnboost2/script.py +++ b/src/methods/single_omics/grnboost2/script.py @@ -22,14 +22,15 @@ gene_names = adata_rna.var.gene_ids.index.to_numpy() X = adata_rna.X.toarray() -# Remove genes with >=90% of zeros -mask = (np.mean(X == 0, axis=0) >= 0.9) -X = X[:, ~mask] -gene_names = gene_names[~mask] - -# Remove samples with >=90% of zeros -mask = (np.mean(X == 0, axis=1) >= 0.9) -adata_rna = X[~mask, :] +if False: + # Remove genes with >=90% of zeros + mask = (np.mean(X == 0, axis=0) >= 0.9) + X = X[:, ~mask] + gene_names = gene_names[~mask] + + # Remove samples with >=90% of zeros + mask = (np.mean(X == 0, axis=1) >= 0.9) + adata_rna = X[~mask, :] # Load list of putative TFs df = pd.read_csv(par["tf_all"], header=None, names=['gene_name']) diff --git a/src/methods/single_omics/pidc/config.vsh.yaml b/src/methods/single_omics/pidc/config.vsh.yaml new file mode 100644 index 000000000..80db737e3 --- /dev/null +++ b/src/methods/single_omics/pidc/config.vsh.yaml @@ -0,0 +1,50 @@ +functionality: + name: pidc + info: + label: pidc + summary: "GRN inference using PIDC" + description: | + GRN inference using PIDC. 
+ documentation_url: https://rdrr.io/github/hmutpw/PIDC/ + arguments: + - name: --multiomics_rna + type: file + example: resources/grn-benchmark/multiomics_rna.h5ad + info: + label: multiomics_rna + summary: "Multiomics RNA data" + file_type: rds + columns: + - name: dummpy + type: string + required: false + required: true + must_exist: true + - name: --prediction + __merge__: ../../api/file_prediction.yaml + required: true + direction: output + - name: --max_n_links + type: integer + default: 50000 + - name: --temp_dir + type: file + direction: output + default: 'output/pidc' + - name: --num_workers + type: integer + direction: input + default: 4 + resources: + - type: python_script + path: script.py + - type: file + path: pidc.jl + +platforms: + - type: docker + image: apassemi/pidc:latest + - type: native + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/methods/single_omics/pidc/pidc.jl b/src/methods/single_omics/pidc/pidc.jl new file mode 100644 index 000000000..fd4a51808 --- /dev/null +++ b/src/methods/single_omics/pidc/pidc.jl @@ -0,0 +1,11 @@ +using NetworkInference +using LightGraphs + +algorithm = PIDCNetworkInference() +dataset_name = string(ARGS[1]) + +genes = get_nodes(dataset_name); + +network = InferredNetwork(algorithm, genes); + +write_network_file(string(ARGS[2]), network); diff --git a/src/methods/single_omics/pidc/script.py b/src/methods/single_omics/pidc/script.py new file mode 100644 index 000000000..24ada09e3 --- /dev/null +++ b/src/methods/single_omics/pidc/script.py @@ -0,0 +1,58 @@ +import os +import subprocess +import pathlib + +import anndata +import numpy as np +import pandas as pd + + +## VIASH START +par = { + 'multiomics_rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad', + 'tf_all': 'resources_test/prior/tf_all.csv', + 'prediction': 'output/pidc/prediction.csv', + 'temp_dir': 'output/pidc', + 'max_n_links': 50000 +} +## VIASH END + +# Load scRNA-seq data +adata_rna = anndata.read_h5ad(par['multiomics_rna']) +gene_names = adata_rna.var.gene_ids.index.to_numpy() +df_rna = pd.DataFrame(adata_rna.X.toarray(), index=adata_rna.obs.index, columns=adata_rna.var.index) + +# Remove genes with >=90% of zeros +# mask = (np.mean(df_rna.to_numpy() == 0, axis=0) >= 0.9) +# df_rna = df_rna.loc[:, ~mask] + +# # Remove samples with >=90% of zeros +# mask = (np.mean(df_rna.to_numpy() == 0, axis=1) >= 0.9) +# df_rna = df_rna.loc[~mask, :] + +# (genes x samples) format is needed +df_rna = df_rna.transpose() + +# Save expression data +if not os.path.exists(os.path.join(par['temp_dir'], 'multiomics_rna.tsv')): + os.makedirs(par['temp_dir'], exist_ok=True) + df_rna.to_csv(os.path.join(par['temp_dir'], 'multiomics_rna.tsv'), sep='\t', index=True, header=True) + +# Run PIDC +# meta = { +# "resources_dir":'src/methods/single_omics/pidc/' +# } +JL_SCRIPT_PATH = os.path.join(meta['resources_dir'], 'pidc.jl') +subprocess.run([ + 'julia', + JL_SCRIPT_PATH, + os.path.join(par['temp_dir'], 'multiomics_rna.tsv'), + os.path.join(par['temp_dir'], 'output.tsv'), +]) + +# Re-format output +df = pd.read_csv(os.path.join(par['temp_dir'], 'output.tsv'), header=None, names=['source', 'target', 'weight'], sep='\t') +df = df.head(par['max_n_links']) +df.to_csv(par['prediction'], header=True, sep=',') + +print('Finished.') diff --git a/src/methods/single_omics/pidc/test.sh b/src/methods/single_omics/pidc/test.sh new file mode 100644 index 000000000..8c47f3a0d --- /dev/null +++ b/src/methods/single_omics/pidc/test.sh @@ -0,0 +1,2 @@ +viash build 
src/methods/pidc/config.vsh.yaml -p docker -o bin/pidc && bin/pidc/pidc --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv +#viash run src/methods/pidc/config.vsh.yaml -p docker -- ---setup build && bin/pidc/pidc --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv \ No newline at end of file diff --git a/src/methods/single_omics/portia/script.py b/src/methods/single_omics/portia/script.py index 9aecdaf39..542df3b34 100644 --- a/src/methods/single_omics/portia/script.py +++ b/src/methods/single_omics/portia/script.py @@ -23,13 +23,15 @@ X = adata_rna.X.toarray() if scipy.sparse.issparse(adata_rna.X) else adata_rna.X # Remove genes with >=90% of zeros -mask = (np.mean(X == 0, axis=0) >= 0.9) -X = X[:, ~mask] -gene_names = gene_names[~mask] - -# Remove samples with >=90% of zeros -mask = (np.mean(X == 0, axis=1) >= 0.9) -adata_rna = X[~mask, :] +if False: + # Remove genes with >=90% of zeros + mask = (np.mean(X == 0, axis=0) >= 0.9) + X = X[:, ~mask] + gene_names = gene_names[~mask] + + # Remove samples with >=90% of zeros + mask = (np.mean(X == 0, axis=1) >= 0.9) + adata_rna = X[~mask, :] # Load list of putative TFs df = pd.read_csv(par['tf_all'], header=None, names=['gene_name']) diff --git a/src/methods/single_omics/ppcor/script.R b/src/methods/single_omics/ppcor/script.R index 7d458e92f..c05516136 100644 --- a/src/methods/single_omics/ppcor/script.R +++ b/src/methods/single_omics/ppcor/script.R @@ -18,14 +18,15 @@ geneNames <- colnames(inputExpr) colnames(inputExpr) <- c(geneNames) X <- as.matrix(inputExpr) -# Keep genes with less than 10% of zeros -mask <- (apply(X, 2, function(x) mean(x != 0)) >= 0.1) -X <- X[, mask] -geneNames <- geneNames[mask] - -# Keep samples with less than 10% of zeros -mask <- (apply(X, 1, function(x) mean(x != 0)) >= 0.1) -X <- X[mask,] + +# # Keep genes with less than 10% of zeros +# mask <- (apply(X, 2, function(x) mean(x != 0)) >= 0.1) +# X <- X[, mask] +# geneNames <- geneNames[mask] + +# # Keep samples with less than 10% of zeros +# mask <- (apply(X, 1, function(x) mean(x != 0)) >= 0.1) +# X <- X[mask,] # Run GRN inference method pcorResults = pcor(x = X, method = "pearson") diff --git a/src/methods/single_omics/scenic/script.py b/src/methods/single_omics/scenic/script.py index 28f4cd46d..fb244917f 100644 --- a/src/methods/single_omics/scenic/script.py +++ b/src/methods/single_omics/scenic/script.py @@ -1,14 +1,8 @@ import os - import anndata import numpy as np import pandas as pd -# from contextlib_chdir import chdir import subprocess -# from urllib.request import urlretrieve -import zipfile -from arboreto.algo import grnboost2 -from distributed import Client import ast # wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather @@ -26,150 +20,77 @@ 'motif_annotation': 'resources_local/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl', 'genes_vs_motifs_10k': 'resources_local/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather', 'genes_vs_motifs_500': 'resources_local/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather', - 'max_n_links': 50000, 'seed': "32" } ## VIASH END os.makedirs(par['temp_dir'], exist_ok=True) -out_dir = 'resources_local' -RANKINGS_DB_PATH = os.path.join(out_dir, 'cistarget-db', 'db.regions_vs_motifs.rankings.feather') -SCORES_DB_PATH = 
os.path.join(out_dir, 'cistarget-db', 'db.regions_vs_motifs.scores.feather') - -# if not (os.path.exists(RANKINGS_DB_PATH) and os.path.exists(SCORES_DB_PATH)): - -# # Download create_cisTarget_databases -# os.makedirs(os.path.join(out_dir, 'cistarget-db'), exist_ok=True) -# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'create_cisTarget_databases')): -# with chdir(os.path.join(out_dir, 'cistarget-db')): -# subprocess.run(['git', 'clone', 'https://github.com/aertslab/create_cisTarget_databases']) - -# # Download cluster-buster -# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'cbust')): -# urlretrieve('https://resources.aertslab.org/cistarget/programs/cbust', os.path.join(out_dir, 'cistarget-db', 'cbust')) -# subprocess.run(['chmod', 'a+x', os.path.join(out_dir, 'cistarget-db', 'cbust')]) - -# # Download motif collection -# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public')): -# urlretrieve( -# 'https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/v10nr_clust_public.zip', -# os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public.zip') -# ) -# with zipfile.ZipFile(os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public.zip'), 'r') as zip_ref: -# zip_ref.extractall(os.path.join(out_dir, 'cistarget-db')) - -# # Download chromosome sizes -# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.chrom.sizes')): -# urlretrieve( -# 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes', -# os.path.join(out_dir, 'cistarget-db', 'hg38.chrom.sizes') -# ) - -# # Download reference genome -# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.fa')): -# print('Downloading reference genome', flush=True) -# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.fa.gz')): -# urlretrieve( -# 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz', -# os.path.join(out_dir, 'cistarget-db', 'hg38.fa.gz') -# ) -# with gzip.open(os.path.join(out_dir, 'cistarget-db', 'hg38.fa.gz'), 'rb') as f_in: -# with open(os.path.join(out_dir, 'cistarget-db', 'hg38.fa'), 'wb') as f_out: -# shutil.copyfileobj(f_in, f_out) - -# # Prepare fasta from consensus regions -# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.with_1kb_bg_padding.fa')): -# subprocess.run([ -# os.path.join(out_dir, 'cistarget-db', 'create_cisTarget_databases', 'create_fasta_with_padded_bg_from_bed.sh'), -# os.path.join(out_dir, 'cistarget-db', 'hg38.fa'), -# os.path.join(out_dir, 'cistarget-db', 'hg38.chrom.sizes'), -# os.path.join(out_dir, 'consensus_peak_calling', 'consensus_regions.bed'), -# os.path.join(out_dir, 'cistarget-db', 'hg38.with_1kb_bg_padding.fa'), -# '1000', -# 'yes' -# ]) - -# # Create cistarget databases -# with open(os.path.join(out_dir, 'cistarget-db', 'motifs.txt'), 'w') as f: -# for filename in os.listdir(os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public', 'singletons')): -# f.write(f'{filename}\n') -# with chdir(os.path.join(out_dir, 'cistarget-db')): -# subprocess.run([ -# 'python', -# os.path.join(out_dir, 'cistarget-db', 'create_cisTarget_databases', 'create_cistarget_motif_databases.py'), -# '-f', os.path.join(out_dir, 'cistarget-db', 'hg38.with_1kb_bg_padding.fa'), -# '-M', os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public', 'singletons'), -# '-m', os.path.join(out_dir, 'cistarget-db', 'motifs.txt'), -# '-c', os.path.join(out_dir, 'cistarget-db', 'cbust'), -# '-o', 'db', -# '--bgpadding', '1000', -# '-t', str(par['num_workers']) -# ]) - - - 
- -# Load scRNA-seq data -adata_rna = anndata.read_h5ad(par['multiomics_rna']) -adata_rna = adata_rna[adata_rna.obs.donor_id=='donor_0',] #TODO: to go -gene_names = adata_rna.var.gene_ids.index.to_numpy() -X = adata_rna.X.toarray() - # Load list of putative TFs # df = pd.read_csv(par["tf_all"], header=None, names=['gene_name']) # tfs = set(list(df['gene_name'])) # tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)] -# format output -expression_data = os.path.join(par['temp_dir'], "expression_data.tsv") - -pd.DataFrame(X, columns=gene_names).to_csv(expression_data, sep='\t', index=False) expr_mat_adjacencies = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv") - - -print('Run grn') -command = [ - "pyscenic", "grn", - "--num_workers", par['max_workers'], - "--seed", par['seed'], - "-o", expr_mat_adjacencies, - expression_data, - par['tf_all'], - -] -subprocess.run(command, check=True) - - -print('Run prune') +expression_data = os.path.join(par['temp_dir'], "expression_data.tsv") regulons = f"{par['temp_dir']}/regulons.csv" -command = [ - "pyscenic", "ctx", - expr_mat_adjacencies, par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'], - "--annotations_fname", par['motif_annotation'], - "--expression_mtx_fname", expression_data, - "--mode", "custom_multiprocessing", - "--output", regulons, - "--num_workers", par['max_workers'] -] -subprocess.run(command, check=True) - -print('Format regulons') -regulons = pd.read_csv(regulons, index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:] - -def format_it(df): - values = df['TargetGenes'].values[0] - genes = [] - weights = [] - for value in ast.literal_eval(values): - genes.append(value[0]) - weights.append(value[1]) - return pd.DataFrame({'target':genes, 'weight':weights}) -grn = regulons.groupby('index').apply(lambda df: format_it(df)).reset_index().rename(columns={'index':'source'}) -network = grn[['source','target','weight']] - +def format_data(par): + print('Read data') + adata_rna = anndata.read_h5ad(par['multiomics_rna']) + gene_names = adata_rna.var_names + pd.DataFrame(adata_rna.X.todense(), columns=gene_names).to_csv(expression_data, sep='\t', index=False) + + +def run_grn(par): + print('Run grn') + command = [ + "pyscenic", "grn", + "--num_workers", par['max_workers'], + "--seed", par['seed'], + "-o", expr_mat_adjacencies, + "--method", "grnboost2", + expression_data, + par['tf_all'] + ] + subprocess.run(command, check=True) + +def prune_grn(par): + print('Run prune') + + command = [ + "pyscenic", "ctx", + expr_mat_adjacencies, par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'], + "--annotations_fname", par['motif_annotation'], + "--expression_mtx_fname", expression_data, + "--mode", "custom_multiprocessing", + "--output", regulons, + "--num_workers", par['max_workers'] + ] + + subprocess.run(command, check=True) +def format_grn(par): + + print('Format regulons') + regulons_df = pd.read_csv(regulons, index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:] + + def format_it(df): + values = df['TargetGenes'].values[0] + genes = [] + weights = [] + for value in ast.literal_eval(values): + genes.append(value[0]) + weights.append(value[1]) + return pd.DataFrame({'target':genes, 'weight':weights}) + grn = regulons_df.groupby('index').apply(lambda df: format_it(df)).reset_index().rename(columns={'index':'source'}) + network = grn[['source','target','weight']] + return network + +format_data(par) +run_grn(par) +prune_grn(par) +network = format_grn(par) network.to_csv(par['prediction'], sep=',') 
print('Finished.') diff --git a/src/methods/single_omics/scsgl/script.py b/src/methods/single_omics/scsgl/script.py index 4f61c5b51..11cc9cc5d 100644 --- a/src/methods/single_omics/scsgl/script.py +++ b/src/methods/single_omics/scsgl/script.py @@ -30,13 +30,13 @@ from pysrc.graphlearning import learn_signed_graph # Remove genes with >=90% of zeros -mask = (np.mean(X == 0, axis=0) >= 0.75) -X = X[:, ~mask] -gene_names = gene_names[~mask] +# mask = (np.mean(X == 0, axis=0) >= 0.75) +# X = X[:, ~mask] +# gene_names = gene_names[~mask] -# Remove samples with >=90% of zeros -mask = (np.mean(X == 0, axis=1) >= 0.75) -X = X[~mask, :] +# # Remove samples with >=90% of zeros +# mask = (np.mean(X == 0, axis=1) >= 0.75) +# X = X[~mask, :] # Run scSGL print('Starting scSGL', flush=True) diff --git a/src/methods/single_omics/tigress/script.R b/src/methods/single_omics/tigress/script.R index 7427c38dc..766c768e7 100644 --- a/src/methods/single_omics/tigress/script.R +++ b/src/methods/single_omics/tigress/script.R @@ -19,16 +19,16 @@ X <- as.matrix(ad$X) gene_names <- colnames(ad) # Remove genes with > 90% of zeros -zero_proportion <- colMeans(X == 0) -mask <- (zero_proportion <= 0.9) -X <- X[, mask] -gene_names <- gene_names[mask] -colnames(X) <- gene_names - -# Remove samples with > 90% of zeros -zero_proportion <- rowMeans(X == 0) -mask <- (zero_proportion <= 0.9) -X <- X[mask,] +# zero_proportion <- colMeans(X == 0) +# mask <- (zero_proportion <= 0.9) +# X <- X[, mask] +# gene_names <- gene_names[mask] +# colnames(X) <- gene_names + +# # Remove samples with > 90% of zeros +# zero_proportion <- rowMeans(X == 0) +# mask <- (zero_proportion <= 0.9) +# X <- X[mask,] # Load list of putative TFs dat <- read.csv(par$tf_all, header = FALSE) diff --git a/src/metrics/regression_1/config.vsh.yaml b/src/metrics/regression_1/config.vsh.yaml index a6fefcba0..25729eb86 100644 --- a/src/metrics/regression_1/config.vsh.yaml +++ b/src/metrics/regression_1/config.vsh.yaml @@ -9,10 +9,11 @@ functionality: Calculates R2 score using regression approach 1. 
arguments: - name: --min_tf - type: integer + type: boolean direction: input description: calculate the scores for the given min tfs in addition to the default required: false + default: false resources: - type: python_script path: script.py diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index c5de93f7a..a94315cb2 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -152,8 +152,8 @@ def regression_1( net_sub = net[net.cell_type==cell_type] else: net_sub = net - - net_sub = select_top_links(net_sub, par) #only top links are considered + if len(net_sub)>par['max_n_links']: + net_sub = select_top_links(net_sub, par) #only top links are considered prturb_adata_sub = prturb_adata[prturb_adata.obs.cell_type==cell_type,:] y_true_sub, y_pred_sub = cross_validation(net_sub, prturb_adata_sub, par) @@ -242,7 +242,7 @@ def main(par): print(f'Compute metrics for layer: {layer}', flush=True) tfs_cases = [-1] if par['min_tf']: - tfs_cases += par['min_tf'] + tfs_cases += 30 layer_results = {} # Store results for this layer for exclude_missing_genes in [False, True]: # two settings on target gene for tf_n in tfs_cases: # two settings on tfs diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py index cba39bbdb..646d2b115 100644 --- a/src/metrics/regression_1/script.py +++ b/src/metrics/regression_1/script.py @@ -6,17 +6,22 @@ ## VIASH START par = { "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", - "prediction": "resources/grn-benchmark/grn_models/collectri.csv", + "tf_all": "resources/prior/tf_all.csv", + "prediction": "output/scenic/scenic.csv", + "method_id": "scenic", + "min_tf": False, + "max_n_links": 50000, + "apply_tf": "true", 'score': 'output/score.h5ad', 'reg_type': 'ridge', 'layer': 'pearson', 'subsample': 200, 'max_workers': 4, } -print(par) - - ## VIASH END +meta = { + "resources_dir":'src/metrics/regression_1/' +} sys.path.append(meta["resources_dir"]) from main import main diff --git a/src/workflows/run_benchmark_single_omics/___main.nf b/src/workflows/run_benchmark_single_omics/___main.nf deleted file mode 100644 index 2fa73958f..000000000 --- a/src/workflows/run_benchmark_single_omics/___main.nf +++ /dev/null @@ -1,315 +0,0 @@ -workflow auto { - findStatesTemp(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - - // construct list of methods - methods = [ - portia - ] - - // construct list of metrics - metrics = [ - regression_1 - ] - - /**************************** - * EXTRACT DATASET METADATA * - ****************************/ - dataset_ch = input_ch - // store join id - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - // extract the dataset metadata - // | extract_metadata.run( - // fromState: [input: "input_solution"], - // toState: { id, output, state -> - // state + [ - // dataset_uns: readYaml(output.output).uns - // ] - // } - // ) - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - score_ch = dataset_ch - - // run all methods - | runEach( - components: methods, - - // use the 'filter' argument to only run a method on the normalisation the component is asking for - // filter: { id, state, comp -> - // def norm = state.dataset_uns.normalization_id - // def pref = comp.config.info.preferred_normalization - // // if the preferred normalisation is none at all, - // // we can pass whichever dataset we want - // def norm_check = 
(norm == "log_cp10k" && pref == "counts") || norm == pref - // def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) - - // method_check && norm_check - // }, - - // define a new 'id' by appending the method name to the dataset id - id: { id, state, comp -> - id + "." + comp.config.name - }, - - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: { id, state, comp -> - def new_args = [ - multiomics_rna: state.multiomics_rna, - tf_all: state.tf_all, - ] - if (comp.config.info.type == "control_method") { - new_args.input_solution = state.input_solution - } - new_args - }, - - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - method_id: comp.config.name, - prediction: output.prediction - ] - } - ) - - // run all metrics - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." + comp.config.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [ - perturbation_data: "perturbation_data", - prediction: "prediction", - subsample: "subsample", - reg_type: "reg_type", - max_workers: "max_workers", - consensus: "consensus", - tf_all: "tf_all" - ], - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.name, - metric_output: output.score - ] - } - ) - - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // TODO: can we store everything below in a separate helper function? - - // extract the dataset metadata - // dataset_meta_ch = dataset_ch - // // // only keep one of the normalization methods - // // | filter{ id, state -> - // // state.dataset_uns.normalization_id == "log_cp10k" - // // } - // | joinStates { ids, states -> - // // store the dataset metadata in a file - // // def dataset_uns = states.collect{state -> - // // def uns = state.dataset_uns.clone() - // // uns.remove("normalization_id") - // // uns - // // } - // def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - // def dataset_uns_file = tempFile("dataset_uns.yaml") - // dataset_uns_file.write(dataset_uns_yaml_blob) - - // // ["output", [output_dataset_info: dataset_uns_file]] - // } - - output_ch = score_ch - - // extract the scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - // store the method configs in a file - def method_configs = methods.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - def viash_file = meta.resources_dir.resolve("_viash.yaml") - def viash_file_content = toYamlBlob(readYaml(viash_file).info) - def task_info_file = tempFile("task_info.yaml") - task_info_file.write(viash_file_content) - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = 
tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - def new_state = [ - output_method_configs: method_configs_file, - output_metric_configs: metric_configs_file, - output_task_info: task_info_file, - output_scores: score_uns_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - // | mix(dataset_meta_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -} - -// temp fix for rename_keys typo - -def findStatesTemp(Map params, Map config) { - def auto_config = deepClone(config) - def auto_params = deepClone(params) - - auto_config = auto_config.clone() - // override arguments - auto_config.argument_groups = [] - auto_config.arguments = [ - [ - type: "string", - name: "--id", - description: "A dummy identifier", - required: false - ], - [ - type: "file", - name: "--input_states", - example: "/path/to/input/directory/**/state.yaml", - description: "Path to input directory containing the datasets to be integrated.", - required: true, - multiple: true, - multiple_sep: ";" - ], - [ - type: "string", - name: "--filter", - example: "foo/.*/state.yaml", - description: "Regex to filter state files by path.", - required: false - ], - // to do: make this a yaml blob? - [ - type: "string", - name: "--rename_keys", - example: ["newKey1:oldKey1", "newKey2:oldKey2"], - description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", - required: false, - multiple: true, - multiple_sep: ";" - ], - [ - type: "string", - name: "--settings", - example: '{"output_dataset": "dataset.h5ad", "k": 10}', - description: "Global arguments as a JSON glob to be passed to all components.", - required: false - ] - ] - if (!(auto_params.containsKey("id"))) { - auto_params["id"] = "auto" - } - - // run auto config through processConfig once more - auto_config = processConfig(auto_config) - - workflow findStatesTempWf { - helpMessage(auto_config) - - output_ch = - channelFromParams(auto_params, auto_config) - | flatMap { autoId, args -> - - def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] - - // look for state files in input dir - def stateFiles = args.input_states - - // filter state files by regex - if (args.filter) { - stateFiles = stateFiles.findAll{ stateFile -> - def stateFileStr = stateFile.toString() - def matcher = stateFileStr =~ args.filter - matcher.matches()} - } - - // read in states - def states = stateFiles.collect { stateFile -> - def state_ = readTaggedYaml(stateFile) - [state_.id, state_] - } - - // construct renameMap - if (args.rename_keys) { - def renameMap = args.rename_keys.collectEntries{renameString -> - def split = renameString.split(":") - assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" - split - } - - // rename keys in state, only let states through which have all keys - // also add global settings - states = states.collectMany{id, state -> - def newState = [:] - - for (key in renameMap.keySet()) { - def origKey = renameMap[key] - if (!(state.containsKey(origKey))) { - return [] - } - newState[key] = state[origKey] - } - - [[id, globalSettings + newState]] - } - } - - states - } - emit: - output_ch - } - - return findStatesTempWf -} \ No newline at end of file diff --git a/src/workflows/run_benchmark_single_omics/__main.nf b/src/workflows/run_benchmark_single_omics/__main.nf deleted file mode 100644 index 1240d344f..000000000 --- a/src/workflows/run_benchmark_single_omics/__main.nf +++ /dev/null @@ -1,246 +0,0 @@ -// construct list of methods -methods = [ - portia -] - -// construct list of metrics -metrics = [ - regression_1 -] - -// helper workflow for starting a workflow based on lists of yaml files -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -// benchmarking workflow -workflow run_wf { - take: - input_ch - - main: - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - score_ch = input_ch - - | run_benchmark_fun( - methods: methods, - metrics: metrics, - methodFromState: { id, state, comp -> - def new_args = [ - multiomics_rna: state.multiomics_rna, - tf_all: state.tf_all, - prediction: 'predictions/$id.$key.output.h5ad', - output_model: null - ] - if (comp.config.functionality.info.type == "control_method") { - new_args.de_test_h5ad = state.de_test_h5ad - } - new_args - }, - methodToState: ["prediction": "prediction"], - metricFromState: [ - perturbation_data: "perturbation_data", - prediction: "prediction", - subsample: "subsample", - reg_type: "reg_type", - max_workers: "max_workers", - consensus: "consensus", - tf_all: "tf_all" - ], - metricToState: ["metric_output": "output"], - methodAuto: [publish: "state"] - ) - | joinStates { ids, states -> - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - ["output", [scores: score_uns_file]] - } - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // create dataset, method and metric metadata files - metadata_ch = input_ch - | create_metadata_files( - datasetFromState: [input: "multiomics_rna"], - methods: methods, - metrics: metrics, - meta: meta - ) - - // merge all of the output data - output_ch = score_ch - | mix(metadata_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -} - - - - - -def run_benchmark_fun(args) { 
- // required args - def methods_ = args.methods - def metrics_ = args.metrics - def methodFromState = args.methodFromState - def methodToState = args.methodToState - def metricFromState = args.metricFromState - def metricToState = args.metricToState - - assert methods_, "methods must be defined" - assert metrics_, "metrics must be defined" - assert methodFromState, "methodFromState must be defined" - assert methodToState, "methodToState must be defined" - assert metricFromState, "metricFromState must be defined" - assert metricToState, "metricToState must be defined" - - // optional args - def keyPrefix = args.keyPrefix ?: "" - def methodAuto = args.methodAuto ?: [:] - def metricAuto = args.metricAuto ?: [:] - - // add the key prefix to the method and metric names - if (keyPrefix && keyPrefix != "") { - methods_ = methods.collect{ method -> - method.run(key: keyPrefix + method.config.functionality.name) - } - metrics_ = metrics.collect{ metric -> - metric.run(key: keyPrefix + metric.config.functionality.name) - } - } - - workflow bench { - take: input_ch - - main: - output_ch = input_ch - // run all methods - | runEach( - components: methods_, - filter: { id, state, comp -> - !state.method_ids || state.method_ids.contains(comp.config.functionality.name) - }, - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - fromState: methodFromState, - toState: methodToState, - auto: methodAuto - ) - - // run all metrics - | runEach( - components: metrics_, - filter: { id, state, comp -> - !state.metric_ids || state.metric_ids.contains(comp.config.functionality.name) - }, - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - fromState: metricFromState, - toState: metricToState, - auto: metricAuto - ) - - // extract the scores - | extract_metadata.run( - key: "${keyPrefix}score_uns", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - emit: output_ch - } - return bench -} - - -def create_metadata_files(args) { - // required args - def meta_ = args.meta - def methods_ = args.methods - def metrics_ = args.metrics - def datasetFromState = args.datasetFromState - - assert meta_, "meta must be defined" - assert methods_, "methods must be defined" - assert metrics_, "metrics must be defined" - assert datasetFromState, "datasetFromState must be defined" - - workflow metadata { - take: input_ch - - main: - output_ch = input_ch - - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - | extract_metadata.run( - key: "dataset_uns", - fromState: args.datasetFromState, - toState: { id, output, state -> - state + [ - dataset_info: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - assert states.size() > 0, "no states found" - assert states[0]._meta, "no _meta found in state[0]" - assert states.every{it.dataset_info}, "not all states have dataset_info" - - // combine the dataset info into one file - def dataset_uns = states.collect{it.dataset_info} - def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - def dataset_uns_file = tempFile("dataset_uns.yaml") - dataset_uns_file.write(dataset_uns_yaml_blob) - - // store the method configs in a file - def method_configs = methods_.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = 
metrics_.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - def task_info_file = meta_.resources_dir.resolve("task_info.yaml") - - def new_state = [ - dataset_uns: dataset_uns_file, - method_configs: method_configs_file, - metric_configs: metric_configs_file, - task_info: task_info_file, - _meta: states[0]._meta - ] - ["output", new_state] - } - emit: output_ch - } - return metadata -} diff --git a/src/workflows/run_benchmark_single_omics/_main.nf b/src/workflows/run_benchmark_single_omics/_main.nf deleted file mode 100644 index d3eb8ee16..000000000 --- a/src/workflows/run_benchmark_single_omics/_main.nf +++ /dev/null @@ -1,129 +0,0 @@ - -workflow auto { - findStatesTemp(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - - // construct list of metrics - metrics = [ - regression_1 - ] - - /*************************** - * RUN METRICS * - ***************************/ - score_ch = input_ch - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - | positive_control.run( - runIf: { id, state -> - state.method_id == 'positive_control' - }, - fromState: [ - perturbation_data: "perturbation_data", - layer: "layer", - tf_all: "tf_all" - ], - toState: {id, output, state -> - state + [ - prediction: output.prediction - ] - } - ) - | negative_control.run( - runIf: { id, state -> - state.method_id == 'negative_control' - }, - fromState: [ - perturbation_data: "perturbation_data" - ], - toState: {id, output, state -> - state + [ - prediction: output.prediction - ] - } - ) - - // run all metrics - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." 
+ comp.config.functionality.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [ - perturbation_data: "perturbation_data", - layer: "layer", - prediction: "prediction", - subsample: "subsample", - reg_type: "reg_type", - method_id: "method_id", - max_workers: "max_workers", - consensus: "consensus" - ], - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.functionality.name, - metric_output: output.score - ] - } - ) - - output_ch = score_ch - - // extract the scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - assert states[0]._meta, "no _meta found in state[0]" - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - def task_info_file = meta.resources_dir.resolve("task_info.yaml") - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - def new_state = [ - metric_configs: metric_configs_file, - scores: score_uns_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -} diff --git a/src/workflows/run_benchmark_single_omics/config.vsh.yaml b/src/workflows/run_benchmark_single_omics/config.vsh.yaml index aa6261341..8821ad9fb 100644 --- a/src/workflows/run_benchmark_single_omics/config.vsh.yaml +++ b/src/workflows/run_benchmark_single_omics/config.vsh.yaml @@ -93,6 +93,7 @@ functionality: - name: grn_methods/scsgl - name: grn_methods/tigress - name: grn_methods/scgpt + - name: grn_methods/scenic repositories: - name: openproblems type: github