Commit

small changes to single omics

janursa committed Sep 30, 2024
1 parent 14b161a commit 8258d0a
Showing 12 changed files with 640 additions and 163 deletions.
519 changes: 438 additions & 81 deletions runs.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion scripts/sbatch/calculate_scores.sh
@@ -8,4 +8,4 @@
#SBATCH --mem=64G
#SBATCH --cpus-per-task=20

-python src/metrics/script_all.py
+python src/metrics/script_all.py
37 changes: 31 additions & 6 deletions src/control_methods/script_all.py
@@ -6,7 +6,7 @@
import random

par = {
-    'write_dir': "resources/grn_models/d0_hvgs",
+    'models_dir': "resources/grn_models/d0_hvgs",
    "multiomics_rna": "resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad",

    "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
@@ -18,30 +18,55 @@
    'causal': True
}

+import argparse
+parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
+parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file')
+parser.add_argument('--prediction', type=str, help='Path to the prediction file')
+parser.add_argument('--resources_dir', type=str, help='Path to the resources directory')
+parser.add_argument('--tf_all', type=str, help='Path to the tf_all file')
+parser.add_argument('--num_workers', type=str, help='Number of cores')
+parser.add_argument('--models_dir', type=str, help='Where to write the models')
+args = parser.parse_args()
+
+if args.multiomics_rna:
+    par['multiomics_rna'] = args.multiomics_rna
+if args.tf_all:
+    par['tf_all'] = args.tf_all
+if args.num_workers:
+    par['num_workers'] = args.num_workers
+if args.models_dir:
+    par['models_dir'] = args.models_dir

meta = {
    "resources_dir": 'src/control_methods',
    "util": 'src/utils'
}
+if args.resources_dir:
+    meta['resources_dir'] = args.resources_dir
sys.path.append(meta["resources_dir"])
sys.path.append(meta["util"])

-os.makedirs(par['write_dir'], exist_ok=True)
+os.makedirs(par['models_dir'], exist_ok=True)

from util import create_corr_net


+print(par)
#---- run for pearson_corr
-par['prediction'] = f"{par['write_dir']}/pearson_corr.csv"
+print("run for pearson_corr")
+par['prediction'] = f"{par['models_dir']}/pearson_corr.csv"
par['causal'] = True
net = create_corr_net(par)
net.to_csv(par['prediction'])
#---- run for negative control
+print("run for negative control")
from negative_control.main import main
-par['prediction'] = f"{par['write_dir']}/negative_control.csv"
+par['prediction'] = f"{par['models_dir']}/negative_control.csv"
net = main(par)
net.to_csv(par['prediction'])
#---- run for positive_control
+print("run for positive control")
par['multiomics_rna'] = par['perturbation_data']
-par['prediction'] = f"{par['write_dir']}/positive_control.csv"
+par['prediction'] = f"{par['models_dir']}/positive_control.csv"
net = create_corr_net(par)
net.to_csv(par['prediction'])
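Note: create_corr_net, imported from src/utils/util.py, is not part of this diff. The sketch below is a rough, hypothetical stand-in for a causal (TF-restricted) Pearson baseline like pearson_corr; the function name, the top_k cutoff, and the TF filter are illustrative assumptions, not the repository's actual implementation.

import anndata
import numpy as np
import pandas as pd

def corr_net_sketch(rna_path: str, tf_all: list, top_k: int = 50, causal: bool = True) -> pd.DataFrame:
    # Hypothetical stand-in for util.create_corr_net (illustrative only).
    adata = anndata.read_h5ad(rna_path)
    X = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X
    genes = np.asarray(adata.var_names)
    corr = np.corrcoef(X.T)  # gene-by-gene Pearson correlation from a cells-x-genes matrix
    tfs = set(tf_all)
    sources = [g for g in genes if g in tfs] if causal else list(genes)
    rows = []
    for tf in sources:
        i = int(np.where(genes == tf)[0][0])
        top = np.argsort(-np.abs(corr[i]))[:top_k]  # strongest partners by absolute correlation
        rows += [(tf, genes[j], corr[i, j]) for j in top if j != i]
    return pd.DataFrame(rows, columns=['source', 'target', 'weight'])

The positive control runs the same correlation on the perturbation (evaluation) data itself, which is what the par['multiomics_rna'] = par['perturbation_data'] swap above suggests.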
63 changes: 60 additions & 3 deletions src/exp_analysis/helper.py
@@ -65,6 +65,63 @@ def plot_cumulative_density(data, title='', ax=None, s=1, **kwdgs):
# self.out_deg = degree_centrality(net, source='source', target='target', **kwargs)
# self.in_deg = degree_centrality(net, source='target', target='source', **kwargs)

+class VSA_analysis:
+    '''Vester's sensitivity analysis.
+    - active_sum: the sum along the rows of the influence matrix; it indicates how much a variable influences all the others.
+    - passive_sum: the sum along the columns of the influence matrix; it indicates how sensitive a variable is, i.e. how strongly it reacts to the influence of the others.
+    '''
+    def __init__(self, sample: pd.DataFrame, control: pd.DataFrame, mode='weight', top_q_net=0.75, critical_change_q_t: float = 0.75):
+        # - keep only the top-quantile links
+        control = control[control.weight > control.weight.quantile(top_q_net)]
+        sample = sample[sample.weight > sample.weight.quantile(top_q_net)]
+
+        print('Determine roles')
+        self.vsa_control = self.determine_roles(control)
+        self.vsa_sample = self.determine_roles(sample)
+        print('Determine role change')
+        self.oo = self.diff_roles(self.vsa_control, self.vsa_sample, critical_change_q_t)
+
+    @staticmethod
+    def determine_roles(net, mode='weight', top_q_role=0.9) -> pd.DataFrame:
+        # active sum and passive sum
+        gene_names = np.union1d(net.source.unique(), net.target.unique())
+        if mode == 'weight':
+            active_sum = np.asarray([sum(net.query(f"source == '{gene}'")['weight']) for gene in gene_names])
+            passive_sum = np.asarray([sum(net.query(f"target == '{gene}'")['weight']) for gene in gene_names])
+        else:
+            raise ValueError('define first')
+
+        # define the roles ['Buffering', 'Passive', 'Active', 'Critical'] -> [0, 1, 2, 3]
+        active_sum_threshold = np.quantile(active_sum, top_q_role)
+        passive_sum_threshold = np.quantile(passive_sum, top_q_role)
+        # active_sum, passive_sum = standardize_array(active_sum), standardize_array(passive_sum)
+        roles = [2*as_flag + ps_flag for as_flag, ps_flag in zip(active_sum > active_sum_threshold, passive_sum > passive_sum_threshold)]
+        roles = pd.DataFrame(data={'gene_name': gene_names, 'active_sum': active_sum, 'passive_sum': passive_sum, 'role': roles})
+        roles.set_index('gene_name', inplace=True)
+        return roles
+
+    @staticmethod
+    def diff_roles(control: pd.DataFrame, sample: pd.DataFrame, critical_change_q_t: float = .75) -> pd.DataFrame:
+        '''
+        Find the distance in role from control to sample, and flag the top changes.
+        '''
+        # - match the indices
+        common_genes = pd.Index(np.union1d(sample.index, control.index))
+        control = control.reindex(common_genes).fillna(0)
+        sample = sample.reindex(common_genes).fillna(0)
+        # - calculate the distances
+        as_distance = (sample['active_sum'] - control['active_sum'])
+        ps_distance = (sample['passive_sum'] - control['passive_sum'])
+        overall_distance = as_distance**2 + ps_distance**2
+        df_distance = pd.DataFrame({'as_distance': as_distance, 'ps_distance': ps_distance, 'overall_distance': overall_distance}, index=common_genes)
+
+        # df_distance = df_distance.assign(passive_sum_c=control['passive_sum'], active_sum_c=control['active_sum'])
+        # df_distance = df_distance.assign(passive_sum_s=sample['passive_sum'], active_sum_s=sample['active_sum'])
+
+        # - use the top percentile as the cut-off to flag a critical role change
+        # df_distance['critical_change_as'] = df_distance['as_distance'] > df_distance['as_distance'].quantile(critical_change_q_t)
+        # df_distance['critical_change_ps'] = df_distance['ps_distance'] > df_distance['ps_distance'].quantile(critical_change_q_t)
+        # df_distance['critical_change_overall'] = df_distance['overall_distance'] > df_distance['overall_distance'].quantile(critical_change_q_t)
+        return df_distance
class Exp_analysis:
    '''
    This class provides functions for exploratory analysis of GRNs.
@@ -101,8 +158,6 @@ def plot_centrality(self, df, title='', ax=None, xlabel='Degree', ylabel='Gene',
        if colors is not None:
            for label, color in zip(ax.get_yticklabels(), colors):
                label.set_color(color)
-
-
    def top_edges(self, quantile=0.95):
        grn = self.net
        grn = grn[grn.weight > grn.weight.quantile(quantile)]
@@ -118,7 +173,6 @@ def determine_source_properties(self):
        c_weight = determine_centrality(self.net, mode='weight')

        self.sources = c_weight.merge(c_degree, left_index=True, right_index=True)
-
    # def calculate_centrality_stats(self) -> None:
    #     '''Calculate network centrality metrics such as in and out degree
    #     '''
Expand Down Expand Up @@ -164,6 +218,9 @@ def annotate_genes(self, gene_annotation_df) -> dict[str, float]:
# gene_annot.values
self.gene_annot = {key:round((value*100/len(self.targets)), 1) for key, value in gene_annot.items()}
return self.gene_annot
def analyse_roles(self, ref_net, **kwargs):
self.vsa_obj = VSA_analysis(self.net, ref_net,**kwargs)
self.vsa_results = self.vsa_obj.oo
def plot_interactions(interaction_df: pd.DataFrame, min_subset_size=None, min_degree=None, color_map=None) -> plt.Figure:
    """Upset plot of interactions
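For orientation, here is a minimal usage sketch of the new class, assuming two GRNs stored as CSVs with source, target, and weight columns (the file paths are hypothetical):

import pandas as pd
from helper import VSA_analysis  # src/exp_analysis/helper.py

control_net = pd.read_csv('control_grn.csv')  # hypothetical input files
sample_net = pd.read_csv('sample_grn.csv')

# Compare gene roles between the two networks, keeping the top 25% of links.
vsa = VSA_analysis(sample_net, control_net, top_q_net=0.75)
print(vsa.vsa_control.head())  # per-gene active_sum, passive_sum, and role
print(vsa.oo.sort_values('overall_distance', ascending=False).head(10))  # largest role changes

The same comparison is exposed on Exp_analysis through the new analyse_roles(ref_net) method, which stores the distance table in self.vsa_results.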
1 change: 1 addition & 0 deletions src/methods/single_omics/genie3/config.vsh.yaml
@@ -15,6 +15,7 @@ functionality:
      path: script.py
    - path: /src/utils/util.py
      dest: util.py


platforms:
  - type: docker
3 changes: 3 additions & 0 deletions src/methods/single_omics/scenic/config.vsh.yaml
@@ -27,6 +27,9 @@ functionality:
  resources:
    - type: python_script
      path: script.py
+    - path: /src/utils/util.py
+      dest: util.py
+    # - path: main.py

platforms:
  - type: docker
Empty file.
100 changes: 50 additions & 50 deletions src/methods/single_omics/scenic/script.py
@@ -6,6 +6,9 @@
import ast
import requests
import scipy.sparse as sp
+import sys



# wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather

@@ -27,13 +30,6 @@
    'normalize': False
}
## VIASH END

-import sys
-meta= {
-    "resources_dir": 'src/utils/'
-}

import argparse
parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file')
@@ -53,58 +49,45 @@
    par['num_workers'] = args.num_workers

if args.resources_dir:
-    meta['resources_dir'] = args.resources_dir
+    meta = {'resources_dir': args.resources_dir}

sys.path.append(meta["resources_dir"])
from util import process_links

-os.makedirs(par['temp_dir'], exist_ok=True)
-
-databases = f"{par['temp_dir']}/databases/"
-os.makedirs(databases, exist_ok=True)
-
-par['motif_annotation'] = f'{databases}/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl'
-par['genes_vs_motifs_10k'] = f'{databases}/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
-par['genes_vs_motifs_500'] = f'{databases}/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
-
-if not (os.path.exists(par['motif_annotation'])):
-    print('downloading motif_annotation')
-    response = requests.get("https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl")
-    with open(par['motif_annotation'], "wb") as file:
-        file.write(response.content)
-if not (os.path.exists(par['genes_vs_motifs_10k'])):
-    print('downloading genes_vs_motifs_10k')
-    response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
-    with open(par['genes_vs_motifs_10k'], "wb") as file:
-        file.write(response.content)
-if not (os.path.exists(par['genes_vs_motifs_500'])):
-    print('downloading genes_vs_motifs_500')
-    response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
-    with open(par['genes_vs_motifs_500'], "wb") as file:
-        file.write(response.content)
-
-expr_mat_adjacencies = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv")
-expression_data = os.path.join(par['temp_dir'], "expression_data.tsv")
-regulons = f"{par['temp_dir']}/regulons.csv"
+def download_prior(par):
+    if not (os.path.exists(par['motif_annotation'])):
+        print('downloading motif_annotation')
+        response = requests.get("https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl")
+        with open(par['motif_annotation'], "wb") as file:
+            file.write(response.content)
+    if not (os.path.exists(par['genes_vs_motifs_10k'])):
+        print('downloading genes_vs_motifs_10k')
+        response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
+        with open(par['genes_vs_motifs_10k'], "wb") as file:
+            file.write(response.content)
+    if not (os.path.exists(par['genes_vs_motifs_500'])):
+        print('downloading genes_vs_motifs_500')
+        response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
+        with open(par['genes_vs_motifs_500'], "wb") as file:
+            file.write(response.content)

def format_data(par):
    print('Read data')
    adata_rna = anndata.read_h5ad(par['multiomics_rna'])
    gene_names = adata_rna.var_names
    if sp.issparse(adata_rna.X):
        adata_rna.X = adata_rna.X.toarray()
-    pd.DataFrame(adata_rna.X, columns=gene_names).to_csv(expression_data, sep='\t', index=False)
+    pd.DataFrame(adata_rna.X, columns=gene_names).to_csv(par['expression_data'], sep='\t', index=False)


def run_grn(par):
    print('Run grn')
    command = [
        "pyscenic", "grn",
        "--num_workers", str(par['num_workers']),
        "--seed", str(par['seed']),
-        "-o", expr_mat_adjacencies,
+        "-o", str(par['expr_mat_adjacencies']),
        "--method", "grnboost2",
-        expression_data,
+        str(par['expression_data']),
        par['tf_all']
    ]
    subprocess.run(command, check=True)
@@ -114,11 +97,11 @@ def prune_grn(par):

    command = [
        "pyscenic", "ctx",
-        expr_mat_adjacencies, par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'],
+        par['expr_mat_adjacencies'], par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'],
        "--annotations_fname", par['motif_annotation'],
-        "--expression_mtx_fname", expression_data,
+        "--expression_mtx_fname", par['expression_data'],
        "--mode", "custom_multiprocessing",
-        "--output", regulons,
+        "--output", str(par['regulons']),
        "--num_workers", str(par['num_workers']),
        "--rank_threshold", str(par['rank_threshold']),
        "--auc_threshold", str(par['auc_threshold']),
@@ -129,7 +112,7 @@
def format_grn(par):

    print('Format regulons')
-    regulons_df = pd.read_csv(regulons, index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:]
+    regulons_df = pd.read_csv(par['regulons'], index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:]

    def format_it(df):
        values = df['TargetGenes'].values[0]
@@ -142,13 +125,30 @@ def format_it(df):
    grn = regulons_df.groupby('index').apply(lambda df: format_it(df)).reset_index().rename(columns={'index':'source'})
    network = grn[['source','target','weight']]
    return network
+def main(par):
+    databases = f"{par['temp_dir']}/databases/"
+    os.makedirs(databases, exist_ok=True)
+
+    par['motif_annotation'] = f'{databases}/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl'
+    par['genes_vs_motifs_10k'] = f'{databases}/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
+    par['genes_vs_motifs_500'] = f'{databases}/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
+
+    par['expr_mat_adjacencies'] = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv")
+    par['expression_data'] = os.path.join(par['temp_dir'], "expression_data.tsv")
+    par['regulons'] = f"{par['temp_dir']}/regulons.csv"
+
+    # format_data(par)
+    # run_grn(par)
+    prune_grn(par)
+    network = format_grn(par)
+    return network

+if __name__=='__main__':
+    network = main(par)
+    os.makedirs(par['temp_dir'], exist_ok=True)
-format_data(par)
-run_grn(par)
-prune_grn(par)
-network = format_grn(par)
-network.to_csv(par['prediction'], sep=',')
+    network.to_csv(par['prediction'], sep=',')
-print('Finished.')
+    print('Finished.')
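With the driver code folded into main(par), the scenic step can be imported and run programmatically instead of only as a script. Note that format_data and run_grn are commented out inside main in this commit, so the expression and adjacency files must already exist in temp_dir. The invocation below is a sketch; the par keys and values are assumptions read off the script above, not a documented API.

import sys
sys.path.append('src/methods/single_omics/scenic')  # illustrative path
from script import main

par = {
    'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad',
    'tf_all': 'resources/prior/tf_all.csv',  # assumed location of the TF list
    'temp_dir': 'output/scenic',
    'num_workers': 20,
    'seed': 32,              # used by the grnboost2 step when enabled
    'rank_threshold': 5000,  # example pyscenic ctx settings
    'auc_threshold': 0.05,
}
network = main(par)  # runs prune_grn + format_grn over the files in temp_dir
network.to_csv('output/scenic/prediction.csv', sep=',')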


20 changes: 15 additions & 5 deletions src/metrics/regression_2/consensus/script.py
@@ -11,11 +11,21 @@
## VIASH START
par = {
    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
-    'grn_folder': 'resources/grn-benchmark/grn_models/d0_hvg',
-    'grns': 'pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle',
+    # 'models_dir': 'resources/grn-benchmark/grn_models/d0_hvg',
+    # 'models': [pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle],
    'output': 'resources/grn-benchmark/consensus-num-regulators.json'
}
## VIASH END
+import argparse
+parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
+parser.add_argument('--models_dir', type=str, help='where to read the models')
+parser.add_argument('--models', nargs='+', help="List of models to process", required=True)
+
+args = parser.parse_args()
+if args.models_dir:
+    par['models_dir'] = args.models_dir
+if args.models:
+    par['models'] = args.models


# Load perturbation data
@@ -26,12 +36,12 @@
gene_names = adata_rna.var.index.to_numpy()

print(par['perturbation_data'])
-print(par['grns'])
+print(par['models'])

# Load inferred GRNs
grns = []
-for filename in par['grns'].split(','):
-    filepath = os.path.join(par['grn_folder'], f'{filename}.csv')
+for filename in par['models']:
+    filepath = os.path.join(par['models_dir'], f'{filename}.csv')
    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
    A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
    df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
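The consensus script can now be driven from the command line as well. A sketch using subprocess, in the same style as the scenic script's pyscenic calls (model names and paths are illustrative):

import subprocess

# --models is required (nargs='+'); --models_dir falls back to the value in par.
subprocess.run([
    'python', 'src/metrics/regression_2/consensus/script.py',
    '--models_dir', 'resources/grn_models/d0_hvgs',
    '--models', 'pearson_corr', 'negative_control', 'positive_control',
], check=True)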