From 94b67ec81dda470094f13abeec2235b3ad0c3745 Mon Sep 17 00:00:00 2001 From: jalil Date: Fri, 13 Sep 2024 19:07:55 +0200 Subject: [PATCH] single omics are staged for try2 --- runs.ipynb | 494 ++++++++++-------- scripts/run_baselines.sh | 34 +- scripts/run_benchmark_single_omics.sh | 12 +- scripts/run_grn_evaluation.sh | 39 +- src/control_methods/baseline_corr/script.py | 12 +- .../negative_control/script.py | 9 +- .../config.vsh.yaml | 17 +- .../run_benchmark_single_omics/main.nf | 4 - src/workflows/run_grn_evaluation/main.nf | 37 -- 9 files changed, 324 insertions(+), 334 deletions(-) diff --git a/runs.ipynb b/runs.ipynb index 4b2c295c0..de735f597 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -2592,7 +2592,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 116, "metadata": {}, "outputs": [ { @@ -2611,381 +2611,421 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5Rank
collectri-0.100238-0.2111820.4893160.514896
ppcor0.0270290.0182070.2245140.526332negative_control0.0000000.0000000.0000000.00000016
celloracle0.1716870.2358110.4326170.536534baseline_pearson0.0079130.0620310.4377620.50389113
baseline_dotproduct_causal0.0723270.3928050.1716130.533977baseline_pearson_causal0.1686820.2701380.3935650.5308128
baseline_dotproduct_causal_metacell-0.5221640.2253920.4021200.525323baseline_pearson_causal_celltype0.1365890.2236400.5469900.5540807
scglue0.2456700.2899340.8103890.599267baseline_pearson_causal_metacell0.0659610.1138440.4658540.52489811
scenicplus0.3018340.3924520.6980920.596971positive_control0.2318260.5053840.6319970.6255823
baseline_pearson0.0126800.0621780.3028220.512141collectri0.0000000.0000000.4893160.51489614
genie30.2001460.3354310.8271090.582196granie0.1085540.2091250.3567840.52600810
grnboost20.2645380.4264110.8303840.584299figr0.1540440.2202250.6807810.5657276
figr0.1540440.2202250.6807810.565727celloracle0.1716870.2358110.4326170.5365349
portia0.0137370.0332670.4918040.537863scglue0.2456700.2899340.8103890.5992674
granie0.1085540.2091250.3567840.526008scenicplus0.3018340.3924520.6980920.5969712
negative_control-0.014667-0.0147000.1826150.495925portia0.0137370.0332670.4918040.53786312
baseline_dotproduct_causal_cell_type0.0725500.3577570.1716130.531513ppcor0.0270290.0182070.2245140.52633215
baseline_corr_causal_spearman-0.100238-0.2111820.4893160.514896grnboost20.2645380.4264110.8303840.5842991
baseline_dotproduct0.0687190.3454210.4700810.527163genie30.2001460.3354310.8271090.5821965
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 113, + "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", - "models_all = ['negative_control', 'baseline_pearson', 'baseline_dotproduct', \n", - " 'baseline_pearson_causal', 'baseline_dotproduct_causal', 'baseline_dotproduct_causal_cell_type', \n", - " 'baseline_dotproduct_causal_metacell', 'positive_control', 'collectri','granie', 'figr', 'celloracle', \n", + "models_all = ['negative_control', 'baseline_pearson', \n", + " 'baseline_pearson_causal', 'baseline_pearson_causal_celltype', \n", + " 'baseline_pearson_causal_metacell', 'positive_control', 'collectri','granie', 'figr', 'celloracle', \n", " 'scglue', 'scenicplus', 'portia','ppcor', 'grnboost2', 'genie3']\n", - "models_all = ['negative_control', 'baseline_pearson', 'baseline_dotproduct', \n", - " 'baseline_dotproduct_causal', 'baseline_dotproduct_causal_cell_type', \n", - " 'baseline_dotproduct_causal_metacell', 'positive_control']\n", + "\n", "def extract_data(data, reg='reg1', dataset_id='scgen_pearson'):\n", " i = 0\n", " for entry in data:\n", @@ -3015,15 +3055,15 @@ "result_file = f'{base_folder}/scores.yaml'\n", "with open(result_file, 'r') as file:\n", " data = yaml.safe_load(file)\n", - "# df_reg1 = extract_data(data, reg='reg1').reindex(models_all).drop(columns=['Mean'])\n", - "# df_reg2 = extract_data(data, reg='reg2').reindex(models_all).drop(columns=['Mean'])\n", + "df_reg1 = extract_data(data, reg='reg1').reindex(models_all).drop(columns=['Mean'])\n", + "df_reg2 = extract_data(data, reg='reg2').reindex(models_all).drop(columns=['Mean'])\n", "\n", - "df_reg1 = extract_data(data, reg='reg1').drop(columns=['Mean'])\n", - "df_reg2 = extract_data(data, reg='reg2').drop(columns=['Mean'])\n", - "df_all = pd.concat([df_reg1, df_reg2], axis=1).fillna(0)\n", - "# df_all[df_all<0]=0\n", - "# df_all = (df_all-df_all.min(axis=0))/(df_all.max(axis=0)-df_all.min(axis=0))\n", - "# df_all['Mean'] = df_all.mean(axis=1)\n", + "# df_reg1 = extract_data(data, reg='reg1').drop(columns=['Mean'])\n", + "# df_reg2 = extract_data(data, reg='reg2').drop(columns=['Mean'])\n", + "df_all = pd.concat([df_reg1,df_reg2], axis=1).fillna(0)\n", + "df_all[df_all<0]=0\n", + "df_all_n = (df_all-df_all.min(axis=0))/(df_all.max(axis=0)-df_all.min(axis=0))\n", + "df_all['Rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", "df_all.style.background_gradient()" ] }, diff --git a/scripts/run_baselines.sh b/scripts/run_baselines.sh index 4d32eb58f..6108cc35e 100644 --- a/scripts/run_baselines.sh +++ b/scripts/run_baselines.sh @@ -1,3 +1,10 @@ +echo "negative control" +viash run src/control_methods/negative_control/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ + --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \ + --tf_all resources/prior/tf_all.csv \ + --prediction resources/grn_models/baselines/negative_control.csv + + echo "baseline pearson" viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ --tf_all resources/prior/tf_all.csv \ @@ -8,45 +15,36 @@ viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna --impute false \ --prediction resources/grn_models/baselines/baseline_pearson.csv -echo "baseline dotproduct" -viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ - --causal false \ - --corr_method dotproduct \ - --cell_type_specific false \ - --metacell false \ - --impute false \ - --prediction resources/grn_models/baselines/baseline_dotproduct.csv - -echo "baseline dotproduct causal" +echo "baseline pearson causal" viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ --tf_all resources/prior/tf_all.csv \ --causal true \ - --corr_method dotproduct \ + --corr_method pearson \ --cell_type_specific false \ --metacell false \ --impute false \ - --prediction resources/grn_models/baselines/baseline_dotproduct_causal.csv + --prediction resources/grn_models/baselines/baseline_pearson_causal.csv + echo "baseline causal cell type" viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ --tf_all resources/prior/tf_all.csv \ --causal true \ - --corr_method dotproduct \ + --corr_method pearson \ --cell_type_specific true \ --metacell false \ --impute false \ - --prediction resources/grn_models/baselines/baseline_dotproduct_causal_celltype.csv + --prediction resources/grn_models/baselines/baseline_pearson_causal_celltype.csv -echo "baseline dotproduct causal metacell" +echo "baseline pearson causal metacell" viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ --tf_all resources/prior/tf_all.csv \ --causal true \ - --corr_method dotproduct \ + --corr_method pearson \ --cell_type_specific false \ --metacell true \ --impute false \ - --prediction resources/grn_models/baselines/baseline_dotproduct_causal_metacell.csv + --prediction resources/grn_models/baselines/baseline_pearson_causal_metacell.csv echo "positive control" viash run src/control_methods/positive_control/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ diff --git a/scripts/run_benchmark_single_omics.sh b/scripts/run_benchmark_single_omics.sh index 6deedf95e..b13dbb62c 100644 --- a/scripts/run_benchmark_single_omics.sh +++ b/scripts/run_benchmark_single_omics.sh @@ -1,7 +1,7 @@ #!/bin/bash # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -RUN_ID="single_omics_try2" +RUN_ID="single_omics_inference" # resources_dir="./resources_test/" resources_dir="s3://openproblems-data/resources/grn" publish_dir="${resources_dir}/results/${RUN_ID}" @@ -10,9 +10,10 @@ publish_dir="${resources_dir}/results/${RUN_ID}" reg_type=ridge subsample=-2 max_workers=10 -layer='pearson' -metric_ids="[regression_1, regression_2]" +layer='scgen_pearson' +metric_ids="[regression_1]" method_ids="[tigress, ennet, scsgl, pidc]" +# method_ids="[portia]" param_file="./params/${RUN_ID}.yaml" @@ -30,9 +31,6 @@ param_list: layer: $layer consensus: ${resources_dir}/prior/consensus-num-regulators.json tf_all: ${resources_dir}/prior/tf_all.csv - model_file: ${resources_dir}/supplementary/finetuned_scGPT_adamson/best_model.pt - model_config_file: ${resources_dir}/supplementary/finetuned_scGPT_adamson/args.json - vocab_file: ${resources_dir}/supplementary/finetuned_scGPT_adamson/vocab.json output_state: "state.yaml" publish_dir: "$publish_dir" @@ -52,7 +50,7 @@ HERE # --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf ` # --workspace 53907369739130 ` # --compute-env 6TeIFgV5OY4pJCk8I0bfOh ` -# --params-file ./params/single_omics_try2.yaml ` +# --params-file ./params/single_omics_inference.yaml ` # --config src/common/nextflow_helpers/labels_tw.config diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh index b63d20122..20e1607d6 100644 --- a/scripts/run_grn_evaluation.sh +++ b/scripts/run_grn_evaluation.sh @@ -2,19 +2,19 @@ # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" # reg_type=${1} #GB, ridge -viash ns build --parallel +# viash ns build --parallel reg_type=ridge RUN_ID="grn_evaluation_all_${reg_type}" -# resources_dir="s3://openproblems-data/resources/grn" -resources_dir="./resources" +resources_dir="s3://openproblems-data/resources/grn" +# resources_dir="./resources" publish_dir="${resources_dir}/results/${RUN_ID}" grn_models_folder="${resources_dir}/grn_models" subsample=-2 max_workers=10 layer=scgen_pearson -metric_ids="[regression_1]" +metric_ids="[regression_1, regression_2]" param_file="./params/${RUN_ID}.yaml" @@ -33,10 +33,9 @@ grn_names=( baseline_models=( baseline_pearson - baseline_dotproduct - baseline_dotproduct_causal - baseline_dotproduct_causal_celltype - baseline_dotproduct_causal_metacell + baseline_pearson_causal + baseline_pearson_causal_celltype + baseline_pearson_causal_metacell positive_control ) # Start writing to the YAML file @@ -62,11 +61,11 @@ HERE } -# folder=${grn_models_folder} -# # Loop through grn_names and layers -# for grn_name in "${grn_names[@]}"; do -# append_entry "$grn_name" "$folder" -# done +folder=${grn_models_folder} +# Loop through grn_names and layers +for grn_name in "${grn_names[@]}"; do + append_entry "$grn_name" "$folder" +done folder=${grn_models_folder}/baselines for grn_name in "${baseline_models[@]}"; do @@ -80,13 +79,13 @@ output_state: "state.yaml" publish_dir: "$publish_dir" HERE -nextflow run . \ - -main-script target/nextflow/workflows/run_grn_evaluation/main.nf \ - -profile docker \ - -with-trace \ - -c src/common/nextflow_helpers/labels_ci.config \ - -params-file ${param_file} -subl resources/results/grn_evaluation_all_ridge/scores.yaml +# nextflow run . \ +# -main-script target/nextflow/workflows/run_grn_evaluation/main.nf \ +# -profile docker \ +# -with-trace \ +# -c src/common/nextflow_helpers/labels_ci.config \ +# -params-file ${param_file} +# subl resources/results/grn_evaluation_all_ridge/scores.yaml # ./tw-windows-x86_64.exe launch ` # https://github.com/openproblems-bio/task_grn_inference.git ` diff --git a/src/control_methods/baseline_corr/script.py b/src/control_methods/baseline_corr/script.py index 50b332806..258d04924 100644 --- a/src/control_methods/baseline_corr/script.py +++ b/src/control_methods/baseline_corr/script.py @@ -25,13 +25,13 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray, method="pearson"): i = 0 for group in tqdm(np.unique(groups), desc="Processing groups"): X_sub = X[groups == group, :] - if method == "dotproduct": + if method == "pearson": X_sub = StandardScaler().fit_transform(X_sub) net = np.dot(X_sub.T, X_sub) / X_sub.shape[0] - elif method == "pearson": - net = np.corrcoef(X_sub.T) - # net = pd.DataFrame(X_sub).transpose().corr().values.to_numpy() - net = np.nan_to_num(net, nan=0.0, posinf=0.0, neginf=0.0) + # elif method == "pearson": + # net = np.corrcoef(X_sub.T) + # # net = pd.DataFrame(X_sub).transpose().corr().values.to_numpy() + # net = np.nan_to_num(net, nan=0.0, posinf=0.0, neginf=0.0) elif method == "spearman": net = spearmanr(X_sub).statistic @@ -61,7 +61,7 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray, method="pearson"): return grn print('Read data') multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) -multiomics_rna = multiomics_rna[:,:5000] #TODO: togo +# multiomics_rna = multiomics_rna[:,:5000] #TODO: togo if par['metacell']: print('metacell') diff --git a/src/control_methods/negative_control/script.py b/src/control_methods/negative_control/script.py index 15eac7ef3..fce2276d6 100644 --- a/src/control_methods/negative_control/script.py +++ b/src/control_methods/negative_control/script.py @@ -12,12 +12,18 @@ ## VIASH END print(par) +def process_links(net, par): + net = net[net.source!=net.target] + net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) + net = net_sorted.head(par['max_n_links']).reset_index(drop=True) + return net + print('Reading input data') perturbation_data = ad.read_h5ad(par["perturbation_data"]) gene_names = perturbation_data.var_names.to_numpy() tf_all = np.loadtxt(par['tf_all'], dtype=str) -n_tf = 400 +n_tf = 1200 tfs = tf_all[:n_tf] def create_negative_control(gene_names) -> np.ndarray: @@ -34,6 +40,7 @@ def create_negative_control(gene_names) -> np.ndarray: pivoted_net = pivoted_net.rename(columns={'index': 'target'}) pivoted_net = pivoted_net[pivoted_net['weight'] != 0] +pivoted_net = process_links(pivoted_net, par) print('Saving') pivoted_net.to_csv(par["prediction"]) diff --git a/src/workflows/run_benchmark_single_omics/config.vsh.yaml b/src/workflows/run_benchmark_single_omics/config.vsh.yaml index d31c17656..d31d923da 100644 --- a/src/workflows/run_benchmark_single_omics/config.vsh.yaml +++ b/src/workflows/run_benchmark_single_omics/config.vsh.yaml @@ -12,9 +12,9 @@ functionality: - name: --multiomics_rna type: file direction: input - - name: --multiomics_atac - type: file - direction: input + # - name: --multiomics_atac + # type: file + # direction: input - name: --perturbation_data type: file direction: input @@ -49,15 +49,6 @@ functionality: required: false direction: input default: pearson - - name: --model_file - type: file - direction: input - - name: --model_config_file - type: file - direction: input - - name: --vocab_file - type: file - direction: input - name: Outputs arguments: @@ -93,8 +84,6 @@ functionality: repository: openproblems - name: metrics/regression_2 - name: metrics/regression_1 - - name: control_methods/positive_control - - name: control_methods/negative_control - name: grn_methods/portia - name: grn_methods/ennet - name: grn_methods/genie3 diff --git a/src/workflows/run_benchmark_single_omics/main.nf b/src/workflows/run_benchmark_single_omics/main.nf index f66c1ae6c..702ad6709 100644 --- a/src/workflows/run_benchmark_single_omics/main.nf +++ b/src/workflows/run_benchmark_single_omics/main.nf @@ -76,11 +76,7 @@ workflow run_wf { // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ multiomics_rna: "multiomics_rna", - multiomics_atac: "multiomics_atac", tf_all: "tf_all", - model_file: "model_file", - model_config_file: "model_config_file", - vocab_file: "vocab_file" ], // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp -> diff --git a/src/workflows/run_grn_evaluation/main.nf b/src/workflows/run_grn_evaluation/main.nf index 928051386..0964bb183 100644 --- a/src/workflows/run_grn_evaluation/main.nf +++ b/src/workflows/run_grn_evaluation/main.nf @@ -42,43 +42,6 @@ workflow run_wf { // ] // } // ) - | baseline_corr.run( - runIf: { id, state -> - state.method_id == 'baseline_pearson' - }, - fromState: [ - multiomics_rna: "multiomics_rna", - layer: "layer", - tf_all: "tf_all", - causal: "causal", - corr_method: "corr_method", - cell_type_specific: "cell_type_specific", - metacell: "metacell", - impute: "impute" - - ], - toState: {id, output, state -> - state + [ - prediction: output.prediction - ] - } - ) - - // | negative_control.run( - // runIf: { id, state -> - // state.method_id == 'negative_control' - // }, - // fromState: [ - // perturbation_data: "perturbation_data", - // multiomics_rna: "multiomics_rna", - // tf_all: "tf_all", - // ], - // toState: {id, output, state -> - // state + [ - // prediction: output.prediction - // ] - // } - // ) // run all metrics | runEach(