workflows updated

openproblems-bio · Sep 10, 2024 · a8da63f · a8da63f
1 parent a63a2eb
commit a8da63f
Show file tree

Hide file tree

Showing 15 changed files with 251 additions and 127 deletions.
diff --git a/runs.ipynb b/runs.ipynb
@@ -2184,6 +2184,25 @@
     "!aws s3 sync s3://openproblems-data/resources/grn/results/single_omics_all resources/results/single_omics_all"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "download: s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge/scores.yaml to resources/results/grn_evaluation_so_ridge/scores.yaml\n",
+      "download: s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge/trace.txt to resources/results/grn_evaluation_so_ridge/trace.txt\n",
+      "download: s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge/metric_configs.yaml to resources/results/grn_evaluation_so_ridge/metric_configs.yaml\n"
+     ]
+    }
+   ],
+   "source": [
+    "!aws s3 sync s3://openproblems-data/resources/grn/results/grn_evaluation_so_ridge resources/results/grn_evaluation_so_ridge"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh
@@ -1,23 +1,33 @@
 #!/bin/bash
 
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-reg_type=${1} #GB, ridge
+# reg_type=${1} #GB, ridge
+reg_type=ridge
 
-RUN_ID="grn_evaluation_so_${reg_type}"
+RUN_ID="grn_evaluation_so_all_${reg_type}"
 # resources_dir="s3://openproblems-data/resources/grn"
 resources_dir="./resources"
 publish_dir="${resources_dir}/results/${RUN_ID}"
 grn_models_folder="${resources_dir}/grn_models"
 
 subsample=-2
 max_workers=10
-layer=pearson
-metric_ids="[regression_1]"
+layer=scgen_pearson
+metric_ids="[regression_1, regression_2]"
 
 param_file="./params/${RUN_ID}.yaml"
 
 grn_names=(
+    "scglue"
+    "scenicplus"
+    "celloracle"
+    "granie"
+    "figr"
+    "collectri"
     "genie3"
+    "grnboost2"
+    "ppcor"
+    "portia"
     )
 # Start writing to the YAML file
 cat > $param_file << HERE
@@ -26,47 +36,45 @@ HERE
 
 append_entry() {
   cat >> $param_file << HERE
-  - id: ${reg_type}_${1}_${3}
+  - id: ${reg_type}_${1}
     metric_ids: ${metric_ids}
     perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
     reg_type: $reg_type
     method_id: $1
     subsample: $subsample
     max_workers: $max_workers
     tf_all: ${resources_dir}/prior/tf_all.csv
-    layer: ${3}
+    layer: ${layer}
     consensus: ${resources_dir}/prior/consensus-num-regulators.json
-
-HERE
-
-  # Conditionally append the prediction line if the second argument is "true"
-  if [[ $2 == "true" ]]; then
-    cat >> $param_file << HERE
     prediction: ${grn_models_folder}/$1.csv
 HERE
-  fi
 }
 
-# Loop through grn_names and layers
-
-for grn_name in "${grn_names[@]}"; do
-  append_entry "$grn_name" "true" "$layer"
-done
-
-
-# # Append negative control
-# grn_name="negative_control"
-# for layer in "${layers[@]}"; do
-#   append_entry "$grn_name" "false" "$layer"
-# done
-
-
-# # Append positive controls
-# grn_name="positive_control"
-# for layer in "${layers[@]}"; do
-#   append_entry "$grn_name" "false" "$layer"
+# # Loop through grn_names
+# for grn_name in "${grn_names[@]}"; do
+#   append_entry "$grn_name" 
 # done
 
+append_entry_control() {
+  cat >> $param_file << HERE
+  - id: ${reg_type}_${1}
+    metric_ids: ${metric_ids}
+    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
+    reg_type: $reg_type
+    method_id: $1
+    subsample: $subsample
+    max_workers: $max_workers
+    tf_all: ${resources_dir}/prior/tf_all.csv
+    layer: ${layer}
+    consensus: ${resources_dir}/prior/consensus-num-regulators.json
+    causal: ${2}
+HERE
+}
+# controls
+# append_entry_control "negative_control" ""
+# append_entry_control "positive_control" ""
+append_entry_control "baseline_corr_causal" "True"
+append_entry_control "baseline_corr" "False"
 
 # Append the remaining output_state and publish_dir to the YAML file
 cat >> $param_file << HERE
@@ -88,7 +96,7 @@ nextflow run . \
 #     --main-script target/nextflow/workflows/run_grn_evaluation/main.nf `
 #     --workspace 53907369739130 `
 #     --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
-#     --params-file ./params/scgen_pearson_gb_pcs.yaml `
+#     --params-file ./params/grn_evaluation_so_ridge.yaml `
 #     --config src/common/nextflow_helpers/labels_tw.config
 
 
diff --git a/scripts/run_robust_analys_causal.sh b/scripts/run_robust_analys_causal.sh
@@ -1,55 +1,85 @@
 #!/bin/bash
 # viash ns build --parallel
-RUN_ID="robust_analy_causal" 
-# resources_dir="resources"
-resources_dir="s3://openproblems-data/resources/grn"
-
+RUN_ID="robust_analy_causal_1" 
+resources_dir="resources"
+# resources_dir="s3://openproblems-data/resources/grn"
 publish_dir="${resources_dir}/results/${RUN_ID}"
 
-
-
 reg_type=ridge
 subsample=-2
 max_workers=10
-
-params_list_file="params/list_${RUN_ID}.yaml"
+layer=(scgen_pearson)
+metric_ids="[regression_1]"
 
 param_file="./params/${RUN_ID}.yaml"
+cat >> $param_file << HERE
+param_list:
+HERE
+
+# Add the causal baseline-correlation entry (method_id: baseline_corr_causal)
+cat >> $param_file << HERE
+  - id: corr-causal
+    metric_ids: ${metric_ids}
+    multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
+    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
+    reg_type: $reg_type
+    method_id: baseline_corr_causal
+    layer: ${layer}
+    subsample: $subsample
+    max_workers: $max_workers
+    consensus: ${resources_dir}/prior/consensus-num-regulators.json
+    tf_all: ${resources_dir}/prior/tf_all.csv
+    causal: True
+HERE
 
 append_entry() {
-  cat >> $params_list_file << HERE
+  cat >> $param_file << HERE
   - id: corr-${1}
+    metric_ids: ${metric_ids}
     multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
     perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
     reg_type: $reg_type
-    method_id: corr-${1}
-    layer: ${2}
+    method_id: baseline_corr-${1}
+    layer: ${layer}
     subsample: $subsample
     max_workers: $max_workers
     consensus: ${resources_dir}/prior/consensus-num-regulators.json
     tf_all: ${resources_dir}/prior/tf_all.csv
+    causal: False
 HERE
 }
-# Loop through grn_names and layers
-layers=("pearson")  # Array containing the layer(s)
 
-for layer in "${layers[@]}"; do  # Iterate over each layer in the array
-    for iter in {1..10}; do  # Loop from 1 to 100 iterations
-        append_entry "$iter" "$layer"  # Execute the append_entry command
-    done
+
+for iter in {1..2}; do  # Loop over iterations 1 to 2
+    append_entry "$iter"   # Execute the append_entry command
 done
 
-aws s3 sync params/ s3://openproblems-data/resources/grn/params
-# Append the remaining output_state and publish_dir to the YAML file
 cat >> $param_file << HERE
-param_list: "${resources_dir}/${params_list_file}"
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-# nextflow run . \
-#   -main-script  target/nextflow/workflows/run_robustness_analysis_causal/main.nf \
-#   -profile docker \
-#   -with-trace \
-#   -c src/common/nextflow_helpers/labels_ci.config \
-#   -params-file ${param_file}
+# params_list_file="params/list_${RUN_ID}.yaml"
+
+# param_file="./params/${RUN_ID}.yaml"
+
+
+# # Loop through grn_names and layers
+# layers=("pearson")  # Array containing the layer(s)
+
+
+
+# aws s3 sync params/ s3://openproblems-data/resources/grn/params
+# # Append the remaining output_state and publish_dir to the YAML file
+# cat >> $param_file << HERE
+# param_list: "${resources_dir}/${params_list_file}"
+# output_state: "state.yaml"
+# publish_dir: "$publish_dir"
+# HERE
+
+nextflow run . \
+  -main-script  target/nextflow/workflows/run_robustness_analysis_causal/main.nf \
+  -profile docker \
+  -with-trace \
+  -c src/common/nextflow_helpers/labels_ci.config \
+  -params-file ${param_file}
diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml
@@ -11,8 +11,14 @@ functionality:
   arguments:
     - name: --perturbation_data
       __merge__: file_perturbation_h5ad.yaml
-      required: true
+      required: false
+      direction: input
+      default: resources/grn-benchmark/perturbation_data.h5ad
+    - name: --multiomics_rna
+      __merge__: file_multiomics_rna_h5ad.yaml
+      required: false
       direction: input
+      default: resources/grn-benchmark/multiomics_rna.h5ad
     - name: --layer
       type: string
       direction: input
@@ -21,13 +27,14 @@ functionality:
       required: false
     - name: --prediction
       __merge__: file_prediction.yaml
-      required: true
+      required: false
       direction: output
     - name: --tf_all
       type: file
       required: false
       direction: input
       example: resources_test/prior/tf_all.csv
+      default: resources/prior/tf_all.csv
 
 
   test_resources:

diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
@@ -12,11 +12,13 @@ functionality:
       __merge__: file_multiomics_rna_h5ad.yaml
       required: false
       direction: input
+      default: resources/grn-benchmark/multiomics_rna.h5ad
     - name: --multiomics_atac
       __merge__: file_multiomics_atac_h5ad.yaml
       required: false
       direction: input
       must_exist: false
+      default: resources/grn-benchmark/multiomics_atac.h5ad
     - name: --prediction
       __merge__: file_prediction.yaml
       required: false

diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
@@ -52,6 +52,11 @@ functionality:
       type: boolean 
       required: false
       default: true
+    - name: --clip_scores
+      type: boolean 
+      required: false
+      default: true
+      description: clips the r2 scores for each gene to make them within [0, 1]
 
 
 

diff --git a/src/control_methods/baseline_corr/config.vsh.yaml b/src/control_methods/baseline_corr/config.vsh.yaml
@@ -0,0 +1,32 @@
+__merge__: ../../api/comp_control_method.yaml
+
+functionality:
+  name: baseline_corr
+  info:
+    label: baseline_corr
+    summary: "Baseline based on Pearson corr"
+
+  arguments:
+    - name: --causal
+      type: boolean 
+      direction: input
+      default: false
+    - name: --seed
+      type: integer
+      direction: input
+
+
+  resources:
+    - type: python_script
+      path: script.py
+
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        packages: [  ]
+  - type: native
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, midcpu]
diff --git a/src/robustness_analysis/causal/script.py → src/control_methods/baseline_corr/script.py b/src/robustness_analysis/causal/script.py → src/control_methods/baseline_corr/script.py
@@ -6,14 +6,10 @@
 from tqdm import tqdm
 from sklearn.preprocessing import StandardScaler
 
-
 ## VIASH START
 par = {
-
 }
-
 ## VIASH END
-
 def create_corr_net(X: np.ndarray, groups: np.ndarray):
     grns = []
     for group in tqdm(np.unique(groups), desc="Processing groups"):
@@ -22,10 +18,10 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray):
         grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
         grns.append(grn)
     return np.mean(grns, axis=0)
-
-
 print('Read data')
 multiomics_rna = ad.read_h5ad(par["multiomics_rna"])
+# print('subsetting: remove this')
+# multiomics_rna = multiomics_rna[:5000, :5000]
 gene_names = multiomics_rna.var_names.to_numpy()
 tf_all = np.loadtxt(par['tf_all'], dtype=str)
 groups = multiomics_rna.obs.cell_type
@@ -39,14 +35,15 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray):
 print('Create corr net')
 net = create_corr_net(multiomics_rna.X, groups)
 net = pd.DataFrame(net, index=gene_names, columns=gene_names)
+
 if par['causal']:
-    net_corr = net[tf_all]
+    net = net[tf_all]
 else:
-    net_corr = net.sample(len(tf_all), axis=1)
-net_corr = net_corr.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
-net_corr.rename(columns={'index': 'target'}, inplace=True)
+    net = net.sample(len(tf_all), axis=1, random_state=par['seed'])
+
+net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
+net.rename(columns={'index': 'target'}, inplace=True)
 
 
 print('Output GRN')
-net_corr.to_csv(par['prediction'])
-
+net.to_csv(par['prediction'])