diff --git a/runs.ipynb b/runs.ipynb index 1b0aa2a5d..3be255f89 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -2184,6 +2184,25 @@ "!aws s3 sync s3://openproblems-data/resources/grn/results/single_omics_all resources/results/single_omics_all" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "download: s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge/scores.yaml to resources/results/grn_evaluation_so_ridge/scores.yaml\n", + "download: s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge/trace.txt to resources/results/grn_evaluation_so_ridge/trace.txt\n", + "download: s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge/metric_configs.yaml to resources/results/grn_evaluation_so_ridge/metric_configs.yaml\n" + ] + } + ], + "source": [ + "!aws s3 sync s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge resources/results/grn_evaluation_so_ridge" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh index 5ee9d29e5..d8eb39743 100755 --- a/scripts/run_grn_evaluation.sh +++ b/scripts/run_grn_evaluation.sh @@ -1,9 +1,10 @@ #!/bin/bash # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -reg_type=${1} #GB, ridge +# reg_type=${1} #GB, ridge +reg_type=ridge -RUN_ID="grn_evaluation_so_${reg_type}" +RUN_ID="grn_evaluation_so_all_${reg_type}" # resources_dir="s3://openproblems-data/resources/grn" resources_dir="./resources" publish_dir="${resources_dir}/results/${RUN_ID}" @@ -11,13 +12,22 @@ grn_models_folder="${resources_dir}/grn_models" subsample=-2 max_workers=10 -layer=pearson -metric_ids="[regression_1]" +layer=scgen_pearson +metric_ids="[regression_1, regression_2]" param_file="./params/${RUN_ID}.yaml" grn_names=( + "scglue" + "scenicplus" + "celloracle" + "granie" + "figr" + "collectri" "genie3" + "grnboost2" + "ppcor" + "portia" ) # Start writing to the YAML file cat > $param_file << HERE @@ -26,7 +36,7 @@ HERE append_entry() { cat >> $param_file << HERE - - id: ${reg_type}_${1}_${3} + - id: ${reg_type}_${1} metric_ids: ${metric_ids} perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad reg_type: $reg_type @@ -34,39 +44,37 @@ append_entry() { subsample: $subsample max_workers: $max_workers tf_all: ${resources_dir}/prior/tf_all.csv - layer: ${3} + layer: ${layer} consensus: ${resources_dir}/prior/consensus-num-regulators.json - -HERE - - # Conditionally append the prediction line if the second argument is "true" - if [[ $2 == "true" ]]; then - cat >> $param_file << HERE prediction: ${grn_models_folder}/$1.csv HERE - fi } -# Loop through grn_names and layers - -for grn_name in "${grn_names[@]}"; do - append_entry "$grn_name" "true" "$layer" -done - - -# # Append negative control -# grn_name="negative_control" -# for layer in "${layers[@]}"; do -# append_entry "$grn_name" "false" "$layer" -# done - - -# # Append positive controls -# grn_name="positive_control" -# for layer in "${layers[@]}"; do -# append_entry "$grn_name" "false" "$layer" +# #Loop through grn_names and layers +# for grn_name in "${grn_names[@]}"; do +# append_entry "$grn_name" # done +append_entry_control() { + cat >> $param_file << HERE + - id: ${reg_type}_${1} + metric_ids: ${metric_ids} + perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad + reg_type: $reg_type + method_id: $1 + subsample: $subsample + max_workers: $max_workers + tf_all: ${resources_dir}/prior/tf_all.csv + layer: ${layer} + consensus: ${resources_dir}/prior/consensus-num-regulators.json + causal: ${2} +HERE +} +# controls +# append_entry_control "negative_control" "" +# append_entry_control "positive_control" "" +append_entry_control "baseline_corr_causal" "True" +append_entry_control "baseline_corr" "False" # Append the remaining output_state and publish_dir to the YAML file cat >> $param_file << HERE @@ -88,7 +96,7 @@ nextflow run . \ # --main-script target/nextflow/workflows/run_grn_evaluation/main.nf ` # --workspace 53907369739130 ` # --compute-env 6TeIFgV5OY4pJCk8I0bfOh ` -# --params-file ./params/scgen_pearson_gb_pcs.yaml ` +# --params-file ./params/grn_evaluation_so_ridge.yaml ` # --config src/common/nextflow_helpers/labels_tw.config diff --git a/scripts/run_robust_analys_causal.sh b/scripts/run_robust_analys_causal.sh index 58183be39..7c9afbd25 100755 --- a/scripts/run_robust_analys_causal.sh +++ b/scripts/run_robust_analys_causal.sh @@ -1,55 +1,85 @@ #!/bin/bash # viash ns build --parallel -RUN_ID="robust_analy_causal" -# resources_dir="resources" -resources_dir="s3://openproblems-data/resources/grn" - +RUN_ID="robust_analy_causal_1" +resources_dir="resources" +# resources_dir="s3://openproblems-data/resources/grn" publish_dir="${resources_dir}/results/${RUN_ID}" - - reg_type=ridge subsample=-2 max_workers=10 - -params_list_file="params/list_${RUN_ID}.yaml" +layer=(scgen_pearson) +metric_ids="[regression_1]" param_file="./params/${RUN_ID}.yaml" +cat >> $param_file << HERE +param_list: +HERE + +# add causal corr +cat >> $param_file << HERE + - id: corr-causal + metric_ids: ${metric_ids} + multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad + perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad + reg_type: $reg_type + method_id: baseline_corr_causal + layer: ${layer} + subsample: $subsample + max_workers: $max_workers + consensus: ${resources_dir}/prior/consensus-num-regulators.json + tf_all: ${resources_dir}/prior/tf_all.csv + causal: True +HERE append_entry() { - cat >> $params_list_file << HERE + cat >> $param_file << HERE - id: corr-${1} + metric_ids: ${metric_ids} multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad reg_type: $reg_type - method_id: corr-${1} - layer: ${2} + method_id: baseline_corr-${1} + layer: ${layer} subsample: $subsample max_workers: $max_workers consensus: ${resources_dir}/prior/consensus-num-regulators.json tf_all: ${resources_dir}/prior/tf_all.csv + causal: False HERE } -# Loop through grn_names and layers -layers=("pearson") # Array containing the layer(s) -for layer in "${layers[@]}"; do # Iterate over each layer in the array - for iter in {1..10}; do # Loop from 1 to 100 iterations - append_entry "$iter" "$layer" # Execute the append_entry command - done + +for iter in {1..2}; do # Loop from 1 to 100 iterations + append_entry "$iter" # Execute the append_entry command done -aws s3 sync params/ s3://openproblems-data/resources/grn/params -# Append the remaining output_state and publish_dir to the YAML file cat >> $param_file << HERE -param_list: "${resources_dir}/${params_list_file}" output_state: "state.yaml" publish_dir: "$publish_dir" HERE -# nextflow run . \ -# -main-script target/nextflow/workflows/run_robustness_analysis_causal/main.nf \ -# -profile docker \ -# -with-trace \ -# -c src/common/nextflow_helpers/labels_ci.config \ -# -params-file ${param_file} +# params_list_file="params/list_${RUN_ID}.yaml" + +# param_file="./params/${RUN_ID}.yaml" + + +# # Loop through grn_names and layers +# layers=("pearson") # Array containing the layer(s) + + + +# aws s3 sync params/ s3://openproblems-data/resources/grn/params +# # Append the remaining output_state and publish_dir to the YAML file +# cat >> $param_file << HERE +# param_list: "${resources_dir}/${params_list_file}" +# output_state: "state.yaml" +# publish_dir: "$publish_dir" +# HERE + +nextflow run . \ + -main-script target/nextflow/workflows/run_robustness_analysis_causal/main.nf \ + -profile docker \ + -with-trace \ + -c src/common/nextflow_helpers/labels_ci.config \ + -params-file ${param_file} diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index f048d3ec5..bab694514 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -11,8 +11,14 @@ functionality: arguments: - name: --perturbation_data __merge__: file_perturbation_h5ad.yaml - required: true + required: false + direction: input + default: resources/grn-benchmark/perturbation_data.h5ad + - name: --multiomics_rna + __merge__: file_multiomics_rna_h5ad.yaml + required: false direction: input + default: resources/grn-benchmark/multiomics_rna.h5ad - name: --layer type: string direction: input @@ -21,13 +27,14 @@ functionality: required: false - name: --prediction __merge__: file_prediction.yaml - required: true + required: false direction: output - name: --tf_all type: file required: false direction: input example: resources_test/prior/tf_all.csv + default: resources/prior/tf_all.csv test_resources: diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index bda0e011f..56bf37867 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -12,11 +12,13 @@ functionality: __merge__: file_multiomics_rna_h5ad.yaml required: false direction: input + default: resources/grn-benchmark/multiomics_rna.h5ad - name: --multiomics_atac __merge__: file_multiomics_atac_h5ad.yaml required: false direction: input must_exist: false + default: resources/grn-benchmark/multiomics_atac.h5ad - name: --prediction __merge__: file_prediction.yaml required: false diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index bc8751586..beb9b046c 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -52,6 +52,11 @@ functionality: type: boolean required: false default: true + - name: --clip_scores + type: boolean + required: false + default: true + description: clips the r2 scores for each gene to make them within [0, 1] diff --git a/src/control_methods/baseline_corr/config.vsh.yaml b/src/control_methods/baseline_corr/config.vsh.yaml new file mode 100644 index 000000000..38ae38955 --- /dev/null +++ b/src/control_methods/baseline_corr/config.vsh.yaml @@ -0,0 +1,32 @@ +__merge__: ../../api/comp_control_method.yaml + +functionality: + name: baseline_corr + info: + label: baseline_corr + summary: "Baseline based on Pearson corr" + + arguments: + - name: --causal + type: boolean + direction: input + default: false + - name: --seed + type: integer + direction: input + + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: [ ] + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/robustness_analysis/causal/script.py b/src/control_methods/baseline_corr/script.py similarity index 75% rename from src/robustness_analysis/causal/script.py rename to src/control_methods/baseline_corr/script.py index 08ac633e0..7163901d3 100644 --- a/src/robustness_analysis/causal/script.py +++ b/src/control_methods/baseline_corr/script.py @@ -6,14 +6,10 @@ from tqdm import tqdm from sklearn.preprocessing import StandardScaler - ## VIASH START par = { - } - ## VIASH END - def create_corr_net(X: np.ndarray, groups: np.ndarray): grns = [] for group in tqdm(np.unique(groups), desc="Processing groups"): @@ -22,10 +18,10 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray): grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0] grns.append(grn) return np.mean(grns, axis=0) - - print('Read data') multiomics_rna = ad.read_h5ad(par["multiomics_rna"]) +# print('subsetting: remove this') +# multiomics_rna = multiomics_rna[:5000, :5000] gene_names = multiomics_rna.var_names.to_numpy() tf_all = np.loadtxt(par['tf_all'], dtype=str) groups = multiomics_rna.obs.cell_type @@ -39,14 +35,15 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray): print('Create corr net') net = create_corr_net(multiomics_rna.X, groups) net = pd.DataFrame(net, index=gene_names, columns=gene_names) + if par['causal']: - net_corr = net[tf_all] + net = net[tf_all] else: - net_corr = net.sample(len(tf_all), axis=1) -net_corr = net_corr.reset_index().melt(id_vars='index', var_name='source', value_name='weight') -net_corr.rename(columns={'index': 'target'}, inplace=True) + net = net.sample(len(tf_all), axis=1, random_state=par['seed']) + +net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight') +net.rename(columns={'index': 'target'}, inplace=True) print('Output GRN') -net_corr.to_csv(par['prediction']) - +net.to_csv(par['prediction']) diff --git a/src/metrics/regression_1/config.vsh.yaml b/src/metrics/regression_1/config.vsh.yaml index 008d43f9e..1cd6c1be6 100644 --- a/src/metrics/regression_1/config.vsh.yaml +++ b/src/metrics/regression_1/config.vsh.yaml @@ -12,7 +12,7 @@ functionality: type: string direction: input required: false - default: pearson + default: scgen_pearson - name: --min_tf type: integer direction: input diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py index ba5617d25..3b125b0c4 100644 --- a/src/metrics/regression_2/main.py +++ b/src/metrics/regression_2/main.py @@ -95,7 +95,6 @@ def cross_validate_gene( # results['r2'] = float(r2_score(y_target, y_pred)) results['avg-r2'] = float(np.mean(r2s)) - return results @@ -129,6 +128,7 @@ def learn_background_distribution( n_jobs=n_jobs ) scores.append(res['avg-r2']) + background[n_features] = (np.mean(scores), max(0.001, np.std(scores))) background['max'] = background[max_n_regulators] return background @@ -220,7 +220,8 @@ def static_approach( tf_names: Set[str], reg_type: str, n_jobs:int, - n_features_dict:dict + n_features_dict:dict, + clip_scores:bool ) -> float: # Cross-validate each gene using the inferred GRN to define select input features @@ -230,8 +231,10 @@ def static_approach( for i in range(len(res['scores'])): gene_name = res['gene_names'][i] if n_features[n_features_dict[gene_name]] != 0: - r2.append(res['scores'][i]['avg-r2']) - + score = res['scores'][i]['avg-r2'] + if clip_scores: + score = np.clip(score, 0, 1) + r2.append(score) # mean_r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))]) mean_r2_scores = float(np.mean(r2)) @@ -292,14 +295,15 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: tf_names = np.loadtxt(par['tf_all'], dtype=str) if par['apply_tf']==False: tf_names = gene_names + clip_scores = par['clip_scores'] # Evaluate GRN print(f'Compute metrics for layer: {layer}', flush=True) # print(f'Dynamic approach:', flush=True) print(f'Static approach (theta=0):', flush=True) - score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers'], n_features_dict=n_features_dict) + score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores) print(f'Static approach (theta=0.5):', flush=True) - score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers'], n_features_dict=n_features_dict) + score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores) # print(f'Static approach (theta=1):', flush=True) # score_static_max = static_approach(grn, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers']) # TODO: find a mathematically sound way to combine Z-scores and r2 scores @@ -309,7 +313,7 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: 'static-theta-0.5': [float(score_static_median)] # 'static-theta-1.0': [float(score_static_max)], } - print(f'Scores on {layer}: {results}') + # print(f'Scores on {layer}: {results}') # Add dynamic score if not par['static_only']: diff --git a/src/robustness_analysis/causal/config.vsh.yaml b/src/robustness_analysis/causal/config.vsh.yaml deleted file mode 100644 index 00d3d23b4..000000000 --- a/src/robustness_analysis/causal/config.vsh.yaml +++ /dev/null @@ -1,42 +0,0 @@ -functionality: - name: causal_grn - namespace: "robustness_analysis" - info: - label: causal_grn - summary: Adds noise to the GRNs - arguments: - - name: --multiomics_rna - type: file - direction: input - example: resources_test/grn-benchmark/multiomics_rna.h5ad - default: resources/grn-benchmark/multiomics_rna.h5ad - - - name: --tf_all - type: file - direction: input - example: resources_test/prior/tf_all.csv - default: resources/prior/tf_all.csv - - - name: --prediction - type: file - direction: output - example: resources_test/grn_models/collectri.csv - default: output/prediction.csv - - - name: --causal - type: boolean - direction: input - default: false - - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: [] - - type: nextflow - directives: - label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/workflows/run_grn_evaluation/config.vsh.yaml b/src/workflows/run_grn_evaluation/config.vsh.yaml index ec7da7958..b17eff9d9 100644 --- a/src/workflows/run_grn_evaluation/config.vsh.yaml +++ b/src/workflows/run_grn_evaluation/config.vsh.yaml @@ -46,6 +46,10 @@ functionality: required: false direction: input default: resources/prior/consensus.json + - name: --causal + type: boolean + required: false + direction: input - name: Outputs arguments: @@ -69,6 +73,7 @@ functionality: multiple: true description: A list of metric ids to run. If not specified, all metric will be run. + resources: - type: nextflow_script path: main.nf @@ -82,6 +87,7 @@ functionality: - name: metrics/regression_1 - name: control_methods/positive_control - name: control_methods/negative_control + - name: control_methods/baseline_corr repositories: - name: openproblems type: github diff --git a/src/workflows/run_grn_evaluation/main.nf b/src/workflows/run_grn_evaluation/main.nf index 20d1cc7c5..f5467f789 100644 --- a/src/workflows/run_grn_evaluation/main.nf +++ b/src/workflows/run_grn_evaluation/main.nf @@ -41,6 +41,41 @@ workflow run_wf { ] } ) + + | baseline_corr.run( + runIf: { id, state -> + state.method_id == 'baseline_corr_causal' + }, + fromState: [ + multiomics_rna: "multiomics_rna", + layer: "layer", + tf_all: "tf_all", + causal: "causal", + ], + toState: {id, output, state -> + state + [ + prediction: output.prediction + ] + } + ) + + | baseline_corr.run( + runIf: { id, state -> + state.method_id == 'baseline_corr' + }, + fromState: [ + multiomics_rna: "multiomics_rna", + layer: "layer", + tf_all: "tf_all", + causal: "causal", + seed: "seed" + ], + toState: {id, output, state -> + state + [ + prediction: output.prediction + ] + } + ) | negative_control.run( runIf: { id, state -> state.method_id == 'negative_control' diff --git a/src/workflows/run_robustness_analysis_causal/config.vsh.yaml b/src/workflows/run_robustness_analysis_causal/config.vsh.yaml index e72991fe1..1641bcf4a 100644 --- a/src/workflows/run_robustness_analysis_causal/config.vsh.yaml +++ b/src/workflows/run_robustness_analysis_causal/config.vsh.yaml @@ -42,6 +42,14 @@ functionality: type: file required: false direction: input + - name: --causal + type: boolean + required: false + direction: input + - name: --seed + type: integer + required: false + direction: input - name: Outputs @@ -56,6 +64,12 @@ functionality: required: true direction: output default: metric_configs.yaml + - name: Arguments + arguments: + - name: "--metric_ids" + type: string + multiple: true + description: A list of metric ids to run. If not specified, all metric will be run. resources: - type: nextflow_script @@ -68,7 +82,7 @@ functionality: repository: openproblems - name: metrics/regression_1 - name: metrics/regression_2 - - name: robustness_analysis/causal_grn + - name: control_methods/baseline_corr repositories: - name: openproblems type: github diff --git a/src/workflows/run_robustness_analysis_causal/main.nf b/src/workflows/run_robustness_analysis_causal/main.nf index 6c32c942a..31d82ccb2 100644 --- a/src/workflows/run_robustness_analysis_causal/main.nf +++ b/src/workflows/run_robustness_analysis_causal/main.nf @@ -26,18 +26,25 @@ workflow run_wf { [id, state + ["_meta": [join_id: id]]] } - | causal_grn.run( + | baseline_corr.run( fromState: [ - multiomics_rna: "multiomics_rna", tf_all: "tf_all" + multiomics_rna: "multiomics_rna", + tf_all: "tf_all", + causal:"causal" ], - toState: [ - prediction: "prediction" + toState: {id, output, state -> + state + [ + prediction: output.prediction ] + } ) // run all metrics | runEach( components: metrics, + filter: { id, state, comp -> + !state.metric_ids || state.metric_ids.contains(comp.config.functionality.name) + }, id: { id, state, comp -> id + "." + comp.config.functionality.name },