Skip to content

Commit

Permalink
scglue fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Sep 18, 2024
1 parent 51aba97 commit da9c8c4
Show file tree
Hide file tree
Showing 9 changed files with 170 additions and 165 deletions.
31 changes: 31 additions & 0 deletions runs.ipynb

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions src/api/comp_method.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ functionality:
required: true
direction: input
example: resources_test/grn-benchmark/multiomics_rna.h5ad
- name: --multiomics_atac
__merge__: file_multiomics_atac_h5ad.yaml
required: false
direction: input
example: resources_test/grn-benchmark/multiomics_atac.h5ad
- name: --prediction
__merge__: file_prediction.yaml
required: false
Expand Down
127 changes: 56 additions & 71 deletions src/methods/multi_omics/scenicplus/script.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,47 @@
import os
# import sys
# import yaml
# import pickle
# import tempfile
# import contextlib
# import hashlib
# import shutil
# import requests
# import traceback
# import subprocess
# import gc
# import gzip
# import tarfile
# from urllib.request import urlretrieve

# import numpy as np
# import scanpy as sc
# import pandas as pd
# import anndata
# import pyranges as pr
# from pycistarget.utils import region_names_to_coordinates
# from scenicplus.wrappers.run_pycistarget import run_pycistarget
# import polars
# import scrublet as scr
# from sklearn.manifold import TSNE
# import pycisTopic.loom
# from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
# from pycisTopic.iterative_peak_calling import get_consensus_peaks
# from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, merge
# from pycisTopic.qc import get_barcodes_passing_qc_for_sample
# from pycisTopic.lda_models import evaluate_models, run_cgs_models, run_cgs_models_mallet
# from pycisTopic.topic_binarization import binarize_topics
# from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
# from pycisTopic.diff_features import impute_accessibility, normalize_scores, find_highly_variable_features, find_diff_features
# from pycisTopic.utils import region_names_to_coordinates
# from pycisTopic.gene_activity import get_gene_activity
# from pycisTopic.loom import export_region_accessibility_to_loom, export_gene_activity_to_loom
# from pycisTopic.clust_vis import find_clusters, run_umap, run_tsne, plot_metadata, plot_topic, cell_topic_heatmap
# import gzip
# import shutil
import sys
import yaml
import pickle
import tempfile
import contextlib
import hashlib
import shutil
import requests
import traceback
import subprocess
import gc
import gzip
import tarfile
import shutil
import numpy as np
import scanpy as sc
import pandas as pd
import anndata
import pyranges as pr
import pysam


from urllib.request import urlretrieve


from pycistarget.utils import region_names_to_coordinates
from scenicplus.wrappers.run_pycistarget import run_pycistarget
import polars
import scrublet as scr
from sklearn.manifold import TSNE
import pycisTopic.loom
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
from pycisTopic.iterative_peak_calling import get_consensus_peaks
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, merge
from pycisTopic.qc import get_barcodes_passing_qc_for_sample
from pycisTopic.lda_models import evaluate_models, run_cgs_models, run_cgs_models_mallet
from pycisTopic.topic_binarization import binarize_topics
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
from pycisTopic.diff_features import impute_accessibility, normalize_scores, find_highly_variable_features, find_diff_features
from pycisTopic.utils import region_names_to_coordinates
from pycisTopic.gene_activity import get_gene_activity
from pycisTopic.loom import export_region_accessibility_to_loom, export_gene_activity_to_loom
from pycisTopic.clust_vis import find_clusters, run_umap, run_tsne, plot_metadata, plot_topic, cell_topic_heatmap

## VIASH START
par = {
Expand All @@ -52,36 +55,26 @@
'cell_topic': 'output/scenicplus/cell_topic.csv',
}
## VIASH END


# Bug in pycistopic: Import is missing in pycisTopic.loom,
# so TSNE must be dynamically added to the library's namespace.
# setattr(pycisTopic.loom, 'TSNE', TSNE)
# os.environ['MALLET_MEMORY'] = '200G'
setattr(pycisTopic.loom, 'TSNE', TSNE)
os.environ['MALLET_MEMORY'] = '200G'

atac_dir = f"output/gulack/"
print(atac_dir)
atac_dir = f"{par['temp_dir']}/atac/"
os.makedirs(atac_dir, exist_ok=True)
os.makedirs(f'{atac_dir}/atac', exist_ok=True)
raise ValueError('here')

# Get list of samples (e.g., donors)
print('Collect list of samples')
adata_atac = anndata.read_h5ad(par['multiomics_atac'])
unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories]
unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories]
del adata_atac

def process_atac(par):
print("---Run pre-process started ---", flush=True)
# Create one individual ATAC-seq file per donor
adata_atac = anndata.read_h5ad(par['multiomics_atac'])
unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories]
unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories]
fragments_dict = {}
for donor_id in unique_donor_ids:
filepath = os.path.join(atac_dir, f'{donor_id}.tsv')
if not os.path.exists(filepath):
print(f'Create tsv file {filepath}')

adata_atac.obs.cell_type = [s.replace(' ', '_') for s in adata_atac.obs.cell_type]
adata_atac.obs.donor_id = [s.replace(' ', '_') for s in adata_atac.obs.donor_id]
adata_atac_one_donor = adata_atac[adata_atac.obs.donor_id == donor_id]
Expand All @@ -102,31 +95,24 @@ def process_atac(par):
}
df = pd.DataFrame(d, copy=False)
df.to_csv(filepath, sep='\t', index=False, header=False)

# Compress tsv file
compressed_filepath = filepath + '.gz'
if not os.path.exists(compressed_filepath):

print(f'Sort and compress tsv file {filepath}')

# Sort file by genomic coordinates
sorted_filepath = filepath + '.sorted.tsv'
os.system(f'sort -k1,1 -k2,2n {filepath} > {sorted_filepath}')

# Compression
# subprocess.run(['bgzip', sorted_filepath, '-o', compressed_filepath])
with open(sorted_filepath, 'rb') as f_in:
with gzip.open(compressed_filepath, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)

subprocess.run(['bgzip', sorted_filepath, '-o', compressed_filepath])
print(f'File compressed and saved as {compressed_filepath}')

fragments_dict[donor_id] = compressed_filepath

# Index file using tabix
if not os.path.exists(compressed_filepath + '.tbi'):
# # Index file using tabix
if not os.path.exists(compressed_filepath + '.tbi'): #pip install --user pytabix
print(f'Index compressed file {compressed_filepath} using tabix')
subprocess.run(['tabix', compressed_filepath, '-p', 'bed'])


# Collect cell metadata
print(f'Collect cell metadata')
Expand Down Expand Up @@ -175,7 +161,7 @@ def process_atac(par):
bed_path=os.path.join(par['temp_dir'], 'consensus_peak_calling/pseudobulk_bed_files'),
bigwig_path=os.path.join(par['temp_dir'], 'consensus_peak_calling/pseudobulk_bw_files'),
path_to_fragments=fragments_dict,
n_cpu=10,
n_cpu=par['num_workers'],
temp_dir=os.path.join(par['temp_dir'], 'consensus_peak_calling/tmp'),
split_pattern='-',
)
Expand All @@ -196,7 +182,7 @@ def process_atac(par):
bed_paths=bed_paths,
outdir=os.path.join(os.path.join(par['temp_dir'], 'consensus_peak_calling/MACS')),
genome_size='hs',
n_cpu=10,
n_cpu=par['num_workers'],
input_format='BEDPE',
shift=73,
ext_size=146,
Expand Down Expand Up @@ -244,7 +230,6 @@ def run_cistopic(par):
return
print("---Run cistopic---", flush=True)
if par['qc']: # Whether to perform quality control

# Compute QC metrics
print('Perform QC')
for donor_id in unique_donor_ids:
Expand Down Expand Up @@ -294,7 +279,7 @@ def run_cistopic(par):
path_to_blacklist=os.path.join(par['temp_dir'], 'hg38-blacklist.v2.bed'),
metrics=sample_metrics,
valid_bc=sample_id_to_barcodes_passing_filters[sample_id],
n_cpu=10,
n_cpu=par['num_workers'],
project=donor_id,
split_pattern='-'
)
Expand All @@ -308,7 +293,7 @@ def run_cistopic(par):
path_to_fragments=fragments_dict[donor_id],
path_to_regions=os.path.join(par['temp_dir'], 'consensus_peak_calling/consensus_regions.bed'),
path_to_blacklist=os.path.join(par['temp_dir'], 'hg38-blacklist.v2.bed'),
n_cpu=10,
n_cpu=par['num_workers'],
project=donor_id,
split_pattern='-'
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,22 @@ functionality:
GRN inference using scglue.
documentation_url: https://scglue.readthedocs.io/
arguments:
- name: --annotation_file
type: file
# default: resources/supplementary/gencode.v45.annotation.gtf.gz
example: resources_test/supplementary/gencode.v45.annotation.gtf.gz
- name: --top_n_targets
type: string
required: false
direction: input
- name: --motif_file
type: file
example: resources_test/supplementary/JASPAR2022-hg38.bed.gz
default: 100
- name: --rank_threshold
type: string
required: false
direction: input
default: 1500
- name: --nes_threshold
type: string
required: false
direction: input
default: 1.5


resources:
- type: python_script
Expand Down
85 changes: 23 additions & 62 deletions src/methods/multi_omics/scglue/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,22 +155,22 @@ def run_grn(par):
np.savetxt(f"{par['temp_dir']}/tfs.txt", tfs, fmt="%s")

#Construct the command
# command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom",
# f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv",
# '--seed', '0', '--num_workers', f"{par['num_workers']}",
# '--cell_id_attribute', 'obs_id', '--gene_attribute', 'name']
# print('Run grn')
# result = subprocess.run(command, check=True)
command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom",
f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv",
'--seed', '0', '--num_workers', f"{par['num_workers']}",
'--cell_id_attribute', 'obs_id', '--gene_attribute', 'name']
print('Run grn')
result = subprocess.run(command, check=True)

# print("Output:")
# print(result.stdout)
# print("Error:")
# print(result.stderr)
print("Output:")
print(result.stdout)
print("Error:")
print(result.stderr)

# if result.returncode == 0:
# print("Command executed successfully")
# else:
# print("Command failed with return code", result.returncode)
if result.returncode == 0:
print("Command executed successfully")
else:
print("Command failed with return code", result.returncode)


print("Generate TF cis-regulatory ranking bridged by ATAC peaks", flush=True)
Expand Down Expand Up @@ -217,30 +217,8 @@ def run_grn(par):
).to_csv(f"{par['temp_dir']}/ctx_annotation.tsv", sep="\t", index=False)
def prune_grn(par):
# Construct the command
#TODO: be sure that obs_id is in obs and name is in var
print("Run pscenic ctx", flush=True)
# command = [
# "pyscenic", "ctx",
# f"{par['temp_dir']}/draft_grn.csv",
# f"{par['temp_dir']}/glue.genes_vs_tracks.rankings.feather",
# f"{par['temp_dir']}/supp.genes_vs_tracks.rankings.feather",
# "--annotations_fname", f"{par['temp_dir']}/ctx_annotation.tsv",
# "--expression_mtx_fname", f"{par['temp_dir']}/rna.loom",
# "--output", f"{par['temp_dir']}/pruned_grn.csv",
# "--rank_threshold", "10000",
# "--auc_threshold", "0.1",
# "--nes_threshold", "2",
# "--mask_dropouts",
# "--min_genes", "1",
# "--num_workers", f"{par['num_workers']}",
# "--cell_id_attribute", "obs_id",
# "--gene_attribute", "name"
# ]

par['genes_vs_motifs_500'] = 'output/scenic/databases/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
par['genes_vs_motifs_10k'] = 'output/scenic/databases/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
par['motif_annotation'] = 'output/scenic/databases/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl'

print("Run pscenic ctx", flush=True)
command = [
"pyscenic", "ctx",
f"{par['temp_dir']}/draft_grn.csv",
Expand All @@ -249,14 +227,13 @@ def prune_grn(par):
"--annotations_fname", f"{par['temp_dir']}/ctx_annotation.tsv",
"--expression_mtx_fname", f"{par['temp_dir']}/rna.loom",
"--output", f"{par['temp_dir']}/pruned_grn.csv",
# "--top_n_targets", "100",
# "--rank_threshold", "1500",
"--top_n_targets", str(par['top_n_targets']),
"--rank_threshold", str(par['rank_threshold']),
# "--auc_threshold", "0.1",
# "--nes_threshold", "0",
"--mask_dropouts",
"--nes_threshold", str(par['nes_threshold']),
"--min_genes", "1",
"--num_workers", f"{par['num_workers']}",
"--cell_id_attribute", "obs_id",
"--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var
"--gene_attribute", "name"
]

Expand All @@ -272,22 +249,6 @@ def prune_grn(par):
else:
print("pyscenic ctx failed with return code", result.returncode)

# def download_prior(par):
# # get gene annotation
# response = requests.get("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.annotation.gtf.gz")
# par['annotation_file'] = f"{par['temp_dir']}/gencode.v45.annotation.gtf.gz"
# if response.status_code == 200:
# # Write the content to a file
# with open(par['annotation_file'], 'wb') as file:
# file.write(response.content)
# print(f"File downloaded and saved as gencode.v45.annotation.gtf.gz")
# else:
# print(f"Failed to download the gencode.v45.annotation.gtf.gz. Status code: {response.status_code}")



# annotation_file


def main(par):
import torch
Expand All @@ -299,10 +260,10 @@ def main(par):
rna = ad.read_h5ad(par['multiomics_rna'])
atac = ad.read_h5ad(par['multiomics_atac'])

# print('Preprocess data', flush=True)
# preprocess(rna, atac, par)
# print('Train a model', flush=True)
# training(par)
print('Preprocess data', flush=True)
preprocess(rna, atac, par)
print('Train a model', flush=True)
training(par)
run_grn(par)
prune_grn(par)
print('Curate predictions', flush=True)
Expand Down
Loading

0 comments on commit da9c8c4

Please sign in to comment.