scglue fixed

openproblems-bio · Sep 18, 2024 · da9c8c4 · da9c8c4
1 parent 51aba97
commit da9c8c4
Show file tree

Hide file tree

Showing 9 changed files with 170 additions and 165 deletions.
diff --git a/runs.ipynb b/runs.ipynb
diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
@@ -13,6 +13,11 @@ functionality:
       required: true
       direction: input
       example: resources_test/grn-benchmark/multiomics_rna.h5ad
+    - name: --multiomics_atac
+      __merge__: file_multiomics_atac_h5ad.yaml
+      required: false
+      direction: input
+      example: resources_test/grn-benchmark/multiomics_atac.h5ad
     - name: --prediction
       __merge__: file_prediction.yaml
       required: false

diff --git a/src/methods/multi_omics/scenicplus/script.py b/src/methods/multi_omics/scenicplus/script.py
@@ -1,44 +1,47 @@
 import os
-# import sys
-# import yaml
-# import pickle
-# import tempfile
-# import contextlib
-# import hashlib
-# import shutil
-# import requests
-# import traceback
-# import subprocess
-# import gc
-# import gzip
-# import tarfile
-# from urllib.request import urlretrieve
-
-# import numpy as np
-# import scanpy as sc
-# import pandas as pd
-# import anndata
-# import pyranges as pr
-# from pycistarget.utils import region_names_to_coordinates
-# from scenicplus.wrappers.run_pycistarget import run_pycistarget
-# import polars
-# import scrublet as scr
-# from sklearn.manifold import TSNE
-# import pycisTopic.loom
-# from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
-# from pycisTopic.iterative_peak_calling import get_consensus_peaks
-# from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, merge
-# from pycisTopic.qc import get_barcodes_passing_qc_for_sample
-# from pycisTopic.lda_models import evaluate_models, run_cgs_models, run_cgs_models_mallet
-# from pycisTopic.topic_binarization import binarize_topics
-# from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
-# from pycisTopic.diff_features import impute_accessibility, normalize_scores, find_highly_variable_features, find_diff_features
-# from pycisTopic.utils import region_names_to_coordinates
-# from pycisTopic.gene_activity import get_gene_activity
-# from pycisTopic.loom import export_region_accessibility_to_loom, export_gene_activity_to_loom
-# from pycisTopic.clust_vis import find_clusters, run_umap, run_tsne, plot_metadata, plot_topic, cell_topic_heatmap
-# import gzip
-# import shutil
+import sys
+import yaml
+import pickle
+import tempfile
+import contextlib
+import hashlib
+import shutil
+import requests
+import traceback
+import subprocess
+import gc
+import gzip
+import tarfile
+import shutil
+import numpy as np
+import scanpy as sc
+import pandas as pd
+import anndata
+import pyranges as pr
+import pysam
+
+
+from urllib.request import urlretrieve
+
+
+from pycistarget.utils import region_names_to_coordinates
+from scenicplus.wrappers.run_pycistarget import run_pycistarget
+import polars
+import scrublet as scr
+from sklearn.manifold import TSNE
+import pycisTopic.loom
+from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
+from pycisTopic.iterative_peak_calling import get_consensus_peaks
+from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, merge
+from pycisTopic.qc import get_barcodes_passing_qc_for_sample
+from pycisTopic.lda_models import evaluate_models, run_cgs_models, run_cgs_models_mallet
+from pycisTopic.topic_binarization import binarize_topics
+from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
+from pycisTopic.diff_features import impute_accessibility, normalize_scores, find_highly_variable_features, find_diff_features
+from pycisTopic.utils import region_names_to_coordinates
+from pycisTopic.gene_activity import get_gene_activity
+from pycisTopic.loom import export_region_accessibility_to_loom, export_gene_activity_to_loom
+from pycisTopic.clust_vis import find_clusters, run_umap, run_tsne, plot_metadata, plot_topic, cell_topic_heatmap
 
 ## VIASH START
 par = {
@@ -52,36 +55,26 @@
   'cell_topic': 'output/scenicplus/cell_topic.csv',
 }
 ## VIASH END
-
-
 # Bug in pycistopic: Import is missing in pycisTopic.loom,
 # so TSNE must be dynamically added to the library's namespace.
-# setattr(pycisTopic.loom, 'TSNE', TSNE)
-# os.environ['MALLET_MEMORY'] = '200G'
+setattr(pycisTopic.loom, 'TSNE', TSNE)
+os.environ['MALLET_MEMORY'] = '200G'
 
-atac_dir = f"output/gulack/"
-print(atac_dir)
+atac_dir = f"{par['temp_dir']}/atac/"
 os.makedirs(atac_dir, exist_ok=True)
-os.makedirs(f'{atac_dir}/atac', exist_ok=True)
-raise ValueError('here')
 
-# Get list of samples (e.g., donors)
-print('Collect list of samples')
-adata_atac = anndata.read_h5ad(par['multiomics_atac'])
-unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories]
-unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories]
-del adata_atac
 
 def process_atac(par):
     print("---Run pre-process started ---", flush=True)
     # Create one individual ATAC-seq file per donor
     adata_atac = anndata.read_h5ad(par['multiomics_atac'])
+    unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories]
+    unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories]
     fragments_dict = {}
     for donor_id in unique_donor_ids:
         filepath = os.path.join(atac_dir, f'{donor_id}.tsv')
         if not os.path.exists(filepath):
             print(f'Create tsv file {filepath}')
-
             adata_atac.obs.cell_type = [s.replace(' ', '_') for s in adata_atac.obs.cell_type]
             adata_atac.obs.donor_id = [s.replace(' ', '_') for s in adata_atac.obs.donor_id]
             adata_atac_one_donor = adata_atac[adata_atac.obs.donor_id == donor_id]
@@ -102,31 +95,24 @@ def process_atac(par):
             }
             df = pd.DataFrame(d, copy=False)
             df.to_csv(filepath, sep='\t', index=False, header=False)
-
         # Compress tsv file
         compressed_filepath = filepath + '.gz'
         if not os.path.exists(compressed_filepath):
-
             print(f'Sort and compress tsv file {filepath}')
-
             # Sort file by genomic coordinates
             sorted_filepath = filepath + '.sorted.tsv'
             os.system(f'sort -k1,1 -k2,2n {filepath} > {sorted_filepath}')
-
             # Compression
-            # subprocess.run(['bgzip', sorted_filepath, '-o', compressed_filepath])
-            with open(sorted_filepath, 'rb') as f_in:
-                with gzip.open(compressed_filepath, 'wb') as f_out:
-                    shutil.copyfileobj(f_in, f_out)
-
+            subprocess.run(['bgzip', sorted_filepath, '-o', compressed_filepath])     
             print(f'File compressed and saved as {compressed_filepath}')
 
         fragments_dict[donor_id] = compressed_filepath
 
-        # Index file using tabix
-        if not os.path.exists(compressed_filepath + '.tbi'):
+        # # Index file using tabix
+        if not os.path.exists(compressed_filepath + '.tbi'): #pip install --user pytabix
             print(f'Index compressed file {compressed_filepath} using tabix')
             subprocess.run(['tabix', compressed_filepath, '-p', 'bed'])
+
 
     # Collect cell metadata
     print(f'Collect cell metadata')
@@ -175,7 +161,7 @@ def process_atac(par):
             bed_path=os.path.join(par['temp_dir'], 'consensus_peak_calling/pseudobulk_bed_files'),
             bigwig_path=os.path.join(par['temp_dir'], 'consensus_peak_calling/pseudobulk_bw_files'),
             path_to_fragments=fragments_dict,
-            n_cpu=10,
+            n_cpu=par['num_workers'],
             temp_dir=os.path.join(par['temp_dir'], 'consensus_peak_calling/tmp'),
             split_pattern='-',
         )
@@ -196,7 +182,7 @@ def process_atac(par):
         bed_paths=bed_paths,
         outdir=os.path.join(os.path.join(par['temp_dir'], 'consensus_peak_calling/MACS')),
         genome_size='hs',
-        n_cpu=10,
+        n_cpu=par['num_workers'],
         input_format='BEDPE',
         shift=73,
         ext_size=146,
@@ -244,7 +230,6 @@ def run_cistopic(par):
         return 
     print("---Run cistopic---", flush=True)
     if par['qc']:  # Whether to perform quality control
-
         # Compute QC metrics
         print('Perform QC')
         for donor_id in unique_donor_ids:
@@ -294,7 +279,7 @@ def run_cistopic(par):
                 path_to_blacklist=os.path.join(par['temp_dir'], 'hg38-blacklist.v2.bed'),
                 metrics=sample_metrics,
                 valid_bc=sample_id_to_barcodes_passing_filters[sample_id],
-                n_cpu=10,
+                n_cpu=par['num_workers'],
                 project=donor_id,
                 split_pattern='-'
             )
@@ -308,7 +293,7 @@ def run_cistopic(par):
                 path_to_fragments=fragments_dict[donor_id],
                 path_to_regions=os.path.join(par['temp_dir'], 'consensus_peak_calling/consensus_regions.bed'),
                 path_to_blacklist=os.path.join(par['temp_dir'], 'hg38-blacklist.v2.bed'),
-                n_cpu=10,
+                n_cpu=par['num_workers'],
                 project=donor_id,
                 split_pattern='-'
             )

diff --git a/...hods/multi_omics/scglue/config.novsh.yaml → ...ethods/multi_omics/scglue/config.vsh.yaml b/...hods/multi_omics/scglue/config.novsh.yaml → ...ethods/multi_omics/scglue/config.vsh.yaml
@@ -11,17 +11,22 @@ functionality:
       GRN inference using scglue. 
     documentation_url: https://scglue.readthedocs.io/
   arguments:
-    - name: --annotation_file
-      type: file
-      # default: resources/supplementary/gencode.v45.annotation.gtf.gz
-      example: resources_test/supplementary/gencode.v45.annotation.gtf.gz
+    - name: --top_n_targets
+      type: string
       required: false
       direction: input
-    - name: --motif_file
-      type: file
-      example: resources_test/supplementary/JASPAR2022-hg38.bed.gz
+      default: 100
+    - name: --rank_threshold
+      type: string
       required: false
       direction: input
+      default: 1500
+    - name: --nes_threshold
+      type: string
+      required: false
+      direction: input
+      default: 1.5
+
 
   resources:
     - type: python_script

diff --git a/src/methods/multi_omics/scglue/main.py b/src/methods/multi_omics/scglue/main.py
@@ -155,22 +155,22 @@ def run_grn(par):
     np.savetxt(f"{par['temp_dir']}/tfs.txt", tfs, fmt="%s")
 
     #Construct the command 
-    # command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom", 
-    #            f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv", 
-    #            '--seed', '0', '--num_workers', f"{par['num_workers']}", 
-    #            '--cell_id_attribute', 'obs_id', '--gene_attribute', 'name']
-    # print('Run grn')
-    # result = subprocess.run(command,  check=True)
+    command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom", 
+               f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv", 
+               '--seed', '0', '--num_workers', f"{par['num_workers']}", 
+               '--cell_id_attribute', 'obs_id', '--gene_attribute', 'name']
+    print('Run grn')
+    result = subprocess.run(command,  check=True)
 
-    # print("Output:")
-    # print(result.stdout)
-    # print("Error:")
-    # print(result.stderr)
+    print("Output:")
+    print(result.stdout)
+    print("Error:")
+    print(result.stderr)
 
-    # if result.returncode == 0:
-    #     print("Command executed successfully")
-    # else:
-    #     print("Command failed with return code", result.returncode)
+    if result.returncode == 0:
+        print("Command executed successfully")
+    else:
+        print("Command failed with return code", result.returncode)
 
 
     print("Generate TF cis-regulatory ranking bridged by ATAC peaks", flush=True)
@@ -217,30 +217,8 @@ def run_grn(par):
     ).to_csv(f"{par['temp_dir']}/ctx_annotation.tsv", sep="\t", index=False)
 def prune_grn(par):
     # Construct the command 
-    #TODO: be sure that obs_id is in obs and name is in var
-    print("Run pscenic ctx", flush=True)
-    # command = [
-    #     "pyscenic", "ctx",
-    #     f"{par['temp_dir']}/draft_grn.csv",
-    #     f"{par['temp_dir']}/glue.genes_vs_tracks.rankings.feather",
-    #     f"{par['temp_dir']}/supp.genes_vs_tracks.rankings.feather",
-    #     "--annotations_fname", f"{par['temp_dir']}/ctx_annotation.tsv",
-    #     "--expression_mtx_fname", f"{par['temp_dir']}/rna.loom",
-    #     "--output", f"{par['temp_dir']}/pruned_grn.csv",
-    #     "--rank_threshold", "10000",
-    #     "--auc_threshold", "0.1",
-    #     "--nes_threshold", "2",
-    #     "--mask_dropouts",
-    #     "--min_genes", "1",
-    #     "--num_workers", f"{par['num_workers']}",
-    #     "--cell_id_attribute", "obs_id",
-    #     "--gene_attribute", "name"
-    # ]
 
-    par['genes_vs_motifs_500'] = 'output/scenic/databases/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
-    par['genes_vs_motifs_10k'] =  'output/scenic/databases/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
-    par['motif_annotation'] = 'output/scenic/databases/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl'
-
+    print("Run pscenic ctx", flush=True)
     command = [
         "pyscenic", "ctx", 
         f"{par['temp_dir']}/draft_grn.csv",
@@ -249,14 +227,13 @@ def prune_grn(par):
         "--annotations_fname", f"{par['temp_dir']}/ctx_annotation.tsv",
         "--expression_mtx_fname", f"{par['temp_dir']}/rna.loom",
         "--output", f"{par['temp_dir']}/pruned_grn.csv",
-        # "--top_n_targets", "100",
-        # "--rank_threshold", "1500",
+        "--top_n_targets", str(par['top_n_targets']),
+        "--rank_threshold", str(par['rank_threshold']),
         # "--auc_threshold", "0.1",
-        # "--nes_threshold", "0",
-        "--mask_dropouts",
+        "--nes_threshold", str(par['nes_threshold']), 
         "--min_genes", "1",
         "--num_workers", f"{par['num_workers']}",
-        "--cell_id_attribute", "obs_id",
+        "--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var
         "--gene_attribute", "name"
     ]
 
@@ -272,22 +249,6 @@ def prune_grn(par):
     else:
         print("pyscenic ctx failed with return code", result.returncode)
 
-# def download_prior(par):
-#     # get gene annotation
-#     response = requests.get("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.annotation.gtf.gz")
-#     par['annotation_file'] = f"{par['temp_dir']}/gencode.v45.annotation.gtf.gz"
-#     if response.status_code == 200:
-#         # Write the content to a file
-#         with open(par['annotation_file'], 'wb') as file:
-#             file.write(response.content)
-#         print(f"File downloaded and saved as gencode.v45.annotation.gtf.gz")
-#     else:
-#         print(f"Failed to download the gencode.v45.annotation.gtf.gz. Status code: {response.status_code}")
-
-
-
-#     annotation_file
-
 
 def main(par):
     import torch
@@ -299,10 +260,10 @@ def main(par):
     rna = ad.read_h5ad(par['multiomics_rna'])
     atac = ad.read_h5ad(par['multiomics_atac'])
 
-    # print('Preprocess data', flush=True)
-    # preprocess(rna, atac, par)
-    # print('Train a model', flush=True)
-    # training(par)
+    print('Preprocess data', flush=True)
+    preprocess(rna, atac, par)
+    print('Train a model', flush=True)
+    training(par)
     run_grn(par)
     prune_grn(par)
     print('Curate predictions', flush=True)