subsamples of sm_name-cell_type

openproblems-bio · Aug 17, 2024 · 00c7c93 · 00c7c93
1 parent 238e004
commit 00c7c93
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 138 deletions.
diff --git a/scripts/_run_evaluation.sh b/scripts/_run_evaluation.sh
diff --git a/scripts/_run_evaluation_all.sh b/scripts/_run_evaluation_all.sh
diff --git a/scripts/run_grn_evaluation_tw.sh b/scripts/run_grn_evaluation_tw.sh
@@ -2,11 +2,13 @@
 
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
 
-RUN_ID="subsample_200_gb_reg2"
+RUN_ID="scgen_pearson_gb_sub549"
 resources_dir="s3://openproblems-data/resources/grn"
 publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"
+# grn_models_folder="${resources_dir}/supplementary/grn_models_noised"
+grn_models_folder="${resources_dir}/grn_models"
 reg_type=GB
-subsample=200
+subsample=-1
 max_workers=20
 
 param_file="./params/${RUN_ID}.yaml"
@@ -21,7 +23,8 @@ grn_names=(
     "scglue"
 )
 
-layers=("pearson" "lognorm" "scgen_pearson" "scgen_lognorm" "seurat_pearson" "seurat_lognorm")
+# layers=("pearson" "lognorm" "scgen_pearson" "scgen_lognorm" "seurat_pearson" "seurat_lognorm")
+layers=( "scgen_pearson" )
 
 # Start writing to the YAML file
 cat > $param_file << HERE
@@ -39,7 +42,7 @@ append_entry() {
     max_workers: $max_workers
     consensus: ${resources_dir}/prior/consensus-num-regulators.json
     ${2:+tf_all: ${resources_dir}/prior/tf_all.csv}
-    ${3:+prediction: ${resources_dir}/grn_models/$1.csv}
+    ${3:+prediction: ${grn_models_folder}/$1.csv}
 HERE
 }
 # Loop through grn_names and layers
@@ -49,17 +52,17 @@ for grn_name in "${grn_names[@]}"; do
   done
 done
 
-# Append negative control
-grn_name="negative_control"
-for layer in "${layers[@]}"; do
-  append_entry "$grn_name" "" "true"
-done
+# # Append negative control
+# grn_name="negative_control"
+# for layer in "${layers[@]}"; do
+#   append_entry "$grn_name" "" "true"
+# done
 
-# Append positive controls
-grn_name="positive_control"
-for layer in "${layers[@]}"; do
-  append_entry "$grn_name" "true"
-done
+# # Append positive controls
+# grn_name="positive_control"
+# for layer in "${layers[@]}"; do
+#   append_entry "$grn_name" "true"
+# done
 
 # Append the remaining output_state and publish_dir to the YAML file
 cat >> $param_file << HERE
@@ -81,7 +84,7 @@ HERE
     --main-script target/nextflow/workflows/run_grn_evaluation/main.nf `
     --workspace 53907369739130 `
     --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
-    --params-file ./params/subsample_200_gb_reg2.yaml `
+    --params-file ./params/scgen_pearson_gb_sub549.yaml `
     --config src/common/nextflow_helpers/labels_tw.config
 
 
diff --git a/src/methods/multi_omics/celloracle/config.vsh.yaml b/src/methods/multi_omics/celloracle/config.vsh.yaml
@@ -17,7 +17,7 @@ functionality:
     - name: --links
       type: file
       direction: output
-      default: output/celloracle/links.celloracle.links    
+      default: output/celloracle/links.celloracle.links 
   resources:
     - type: python_script
       path: script.py

diff --git a/src/methods/multi_omics/scglue/config.vsh.yaml b/src/methods/multi_omics/scglue/config.vsh.yaml
@@ -13,14 +13,12 @@ functionality:
   arguments:
     - name: --annotation_file
       type: file
-      example: resources/supplements/gencode.v45.annotation.gtf.gz
-      default: resources/supplements/gencode.v45.annotation.gtf.gz
+      default: resources/supplementary/gencode.v45.annotation.gtf.gz
       required: false
       direction: input
     - name: --motif_file
       type: file
-      example: resources/supplements/JASPAR2022-hg38.bed.gz
-      default: resources/supplements/JASPAR2022-hg38.bed.gz
+      default: resources/supplementary/JASPAR2022-hg38.bed.gz
       required: false
       direction: input
 

diff --git a/src/methods/multi_omics/scglue_ns/config.vsh.yaml b/src/methods/multi_omics/scglue_ns/config.vsh.yaml
@@ -26,4 +26,4 @@ functionality:
 platforms:
   - type: nextflow
     directives:
-      label: [ hightime, midmem, lowcpu ]
+      label: [ hightime, midmem, highcpu ]
diff --git a/src/metrics/regression_2/config.vsh.yaml b/src/metrics/regression_2/config.vsh.yaml
@@ -16,6 +16,7 @@ functionality:
       direction: input
       must_exist: true
       default: 'resources/prior/consensus-num-regulators.json'
+      example: 'resources_test/prior/consensus-num-regulators.json'
 platforms:
   - type: docker
     image: ghcr.io/openproblems-bio/base_python:1.0.4

diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
@@ -216,6 +216,15 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
     subsample = par['subsample']
     if subsample != -1:
         perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]
+
+    if True: # one combination of cell_type, sm_name
+        sampled_obs = perturbation_data.obs.groupby(['sm_name', 'cell_type'], observed=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
+        obs = perturbation_data.obs
+        mask = []
+        for _, row in obs.iterrows():
+            mask.append((sampled_obs==row).all(axis=1).any())  
+        perturbation_data = perturbation_data[mask,:]
+
     gene_names = perturbation_data.var.index.to_numpy()
     n_genes = len(gene_names)
     groups = LabelEncoder().fit_transform(perturbation_data.obs.plate_name)

diff --git a/src/robustness_analysis/add_noise_grn.py b/src/robustness_analysis/add_noise_grn.py
@@ -0,0 +1,30 @@
+import os
+import pandas as pd
+import numpy as np
+# Robustness-analysis script: writes a Gaussian-noised copy of each GRN model CSV.
+layer = 'scgen_pearson'  # NOTE(review): not referenced anywhere else in this script
+grn_folder = 'resources/grn_models'  # input folder scanned for .csv files
+grn_folder_noised = 'resources/supplementary/grn_models_noised'  # output folder
+noise_ratio = 0.2  # noise std dev, as a fraction of the 'weight' column's std dev
+
+# Create the output folder; exist_ok=True means no error if it already exists
+os.makedirs(grn_folder_noised, exist_ok=True)
+
+# Visit every entry directly inside grn_folder (os.listdir does not recurse)
+for file_name in os.listdir(grn_folder):
+    if file_name.endswith('.csv'):  # entries without a .csv suffix are ignored
+        # Load the model table from the input folder
+        file_path = os.path.join(grn_folder, file_name)
+        df = pd.read_csv(file_path)
+
+        # Perturb 'weight' in place; tables lacking that column get no noise
+        if 'weight' in df.columns:
+            std_dev = df['weight'].std()  # spread of this file's own weights
+            noise = np.random.normal(0, noise_ratio * std_dev, size=df['weight'].shape)  # zero-mean, one draw per row; no seed is set in this script
+            df['weight'] += noise
+
+        # Write the table under the same file name in the output folder
+        noised_file_path = os.path.join(grn_folder_noised, file_name)
+        df.to_csv(noised_file_path, index=False)  # index=False: row index is not written
+
+print("Noise added to all GRN models and saved successfully.")  # printed once, after the loop