Skip to content

Commit

Permalink
scenic sent for test run
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Sep 14, 2024
1 parent 56460b4 commit aaabbd9
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 34 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
.DS_Store

# related to files
.pybiomart.sqlite
logs/
params*
resources*
output/
Expand Down
11 changes: 11 additions & 0 deletions batch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#SBATCH --time=12:00:00
#SBATCH --job-name=scenic_donor_0
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err
#SBATCH --mail-type=END
#SBATCH [email protected]
#SBATCH --cpus-per-task=10
#SBATCH --mem=64G

singularity exec ../../images/scenic python src/methods/single_omics/scenic/script.py
11 changes: 0 additions & 11 deletions runs.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3079,17 +3079,6 @@
"df_all['Rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n",
"df_all.style.background_gradient()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"df_reg1 = extract_data(data, reg='reg1')\n",
"df_reg2 = extract_data(data, reg='reg2')"
]
}
],
"metadata": {
Expand Down
142 changes: 124 additions & 18 deletions src/methods/single_omics/scenic/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,116 @@
import anndata
import numpy as np
import pandas as pd
# from contextlib_chdir import chdir
import subprocess
# from urllib.request import urlretrieve
import zipfile
from arboreto.algo import grnboost2
from distributed import Client
import ast

# wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather

# wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather


## VIASH START
par = {
'multiomics_rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad',
'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
"tf_all": 'resources/prior/tf_all.csv',
'prediction': 'output/grnboost2/prediction.csv',
'max_n_links': 50000
'prediction': 'output/scenic/scenic.csv',
'temp_dir': 'output/scenic',
'max_workers': "10",
'motif_annotation': 'resources_local/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl',
'genes_vs_motifs_10k': 'resources_local/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather',
'genes_vs_motifs_500': 'resources_local/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather',

'max_n_links': 50000,
'seed': "32"
}
## VIASH END
os.makedirs(par['temp_dir'], exist_ok=True)

out_dir = 'resources_local'
RANKINGS_DB_PATH = os.path.join(out_dir, 'cistarget-db', 'db.regions_vs_motifs.rankings.feather')
SCORES_DB_PATH = os.path.join(out_dir, 'cistarget-db', 'db.regions_vs_motifs.scores.feather')

# if not (os.path.exists(RANKINGS_DB_PATH) and os.path.exists(SCORES_DB_PATH)):

# # Download create_cisTarget_databases
# os.makedirs(os.path.join(out_dir, 'cistarget-db'), exist_ok=True)
# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'create_cisTarget_databases')):
# with chdir(os.path.join(out_dir, 'cistarget-db')):
# subprocess.run(['git', 'clone', 'https://github.com/aertslab/create_cisTarget_databases'])

# # Download cluster-buster
# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'cbust')):
# urlretrieve('https://resources.aertslab.org/cistarget/programs/cbust', os.path.join(out_dir, 'cistarget-db', 'cbust'))
# subprocess.run(['chmod', 'a+x', os.path.join(out_dir, 'cistarget-db', 'cbust')])

# # Download motif collection
# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public')):
# urlretrieve(
# 'https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/v10nr_clust_public.zip',
# os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public.zip')
# )
# with zipfile.ZipFile(os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public.zip'), 'r') as zip_ref:
# zip_ref.extractall(os.path.join(out_dir, 'cistarget-db'))

# # Download chromosome sizes
# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.chrom.sizes')):
# urlretrieve(
# 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes',
# os.path.join(out_dir, 'cistarget-db', 'hg38.chrom.sizes')
# )

# # Download reference genome
# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.fa')):
# print('Downloading reference genome', flush=True)
# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.fa.gz')):
# urlretrieve(
# 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz',
# os.path.join(out_dir, 'cistarget-db', 'hg38.fa.gz')
# )
# with gzip.open(os.path.join(out_dir, 'cistarget-db', 'hg38.fa.gz'), 'rb') as f_in:
# with open(os.path.join(out_dir, 'cistarget-db', 'hg38.fa'), 'wb') as f_out:
# shutil.copyfileobj(f_in, f_out)

# # Prepare fasta from consensus regions
# if not os.path.exists(os.path.join(out_dir, 'cistarget-db', 'hg38.with_1kb_bg_padding.fa')):
# subprocess.run([
# os.path.join(out_dir, 'cistarget-db', 'create_cisTarget_databases', 'create_fasta_with_padded_bg_from_bed.sh'),
# os.path.join(out_dir, 'cistarget-db', 'hg38.fa'),
# os.path.join(out_dir, 'cistarget-db', 'hg38.chrom.sizes'),
# os.path.join(out_dir, 'consensus_peak_calling', 'consensus_regions.bed'),
# os.path.join(out_dir, 'cistarget-db', 'hg38.with_1kb_bg_padding.fa'),
# '1000',
# 'yes'
# ])

# # Create cistarget databases
# with open(os.path.join(out_dir, 'cistarget-db', 'motifs.txt'), 'w') as f:
# for filename in os.listdir(os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public', 'singletons')):
# f.write(f'{filename}\n')
# with chdir(os.path.join(out_dir, 'cistarget-db')):
# subprocess.run([
# 'python',
# os.path.join(out_dir, 'cistarget-db', 'create_cisTarget_databases', 'create_cistarget_motif_databases.py'),
# '-f', os.path.join(out_dir, 'cistarget-db', 'hg38.with_1kb_bg_padding.fa'),
# '-M', os.path.join(out_dir, 'cistarget-db', 'v10nr_clust_public', 'singletons'),
# '-m', os.path.join(out_dir, 'cistarget-db', 'motifs.txt'),
# '-c', os.path.join(out_dir, 'cistarget-db', 'cbust'),
# '-o', 'db',
# '--bgpadding', '1000',
# '-t', str(par['num_workers'])
# ])




# Load scRNA-seq data
adata_rna = anndata.read_h5ad(par['multiomics_rna'])
adata_rna = adata_rna[adata_rna.obs.donor_id=='donor_0',] #TODO: to go
gene_names = adata_rna.var.gene_ids.index.to_numpy()
X = adata_rna.X.toarray()

Expand All @@ -28,42 +122,54 @@
# tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]

# format output
expression_data = f"{par['temp_dir']}/expression_data.tsv"
expression_data = os.path.join(par['temp_dir'], "expression_data.tsv")

pd.DataFrame(X, columns=gene_names).to_csv(expression_data, sep='\t', index=False)

expr_mat_adjacencies = f"{par['temp_dir']}/expr_mat_adjacencies.tsv"
expr_mat_adjacencies = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv")


print('Run grn')
command = [
"pyscenic", "grn",
"--num_workers", par['max_workers'],
"--seed", par['seed'],
"-o", expr_mat_adjacencies,
expression_data,
par['tf_all']
par['tf_all'],

]

# Run grn
import subprocess
subprocess.run(command, check=True)


# Run prune
print('Run prune')
regulons = f"{par['temp_dir']}/regulons.csv"
annotations_fname = "/data/motifs-v9-nr.hgnc-m0.001-o0.0.tbl"
ranking_1 = "/data/hg19-tss-centered-5kb-7species.mc9nr.genes_vs_motifs.rankings.feather "
ranking_2 = /data/hg19-tss-centered-10kb-7species.mc9nr.genes_vs_motifs.rankings.feather
command = [
"pyscenic", "ctx",
expr_mat_adjacencies, ranking_1, ranking_2,
"--annotations_fname", annotations_fname,
expr_mat_adjacencies, par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'],
"--annotations_fname", par['motif_annotation'],
"--expression_mtx_fname", expression_data,
"--mode", "custom_multiprocessing",
"--output", regulons,
"--num_workers", par['max_workers']
]
subprocess.run(command, check=True)

# Save inferred GRN
print(expr_mat_adjacencies)
network = pd.read_csv(expr_mat_adjacencies, sep='\t')
print('Format regulons')
regulons = pd.read_csv(regulons, index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:]

def format_it(df):
values = df['TargetGenes'].values[0]
genes = []
weights = []
for value in ast.literal_eval(values):
genes.append(value[0])
weights.append(value[1])
return pd.DataFrame({'target':genes, 'weight':weights})
grn = regulons.groupby('index').apply(lambda df: format_it(df)).reset_index().rename(columns={'index':'source'})
network = grn[['source','target','weight']]


network.to_csv(par['prediction'], sep=',')

print('Finished.')
Expand Down
11 changes: 6 additions & 5 deletions src/methods/single_omics/scgpt/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,13 @@

## VIASH START
par = {
'multiomics_rna': '../input/resources_test/grn-benchmark/multiomics_rna.h5ad',
'tf_all': '../input/resources_test/prior/tf_all.csv',
'multiomics_rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad',
'tf_all': 'resources_test/prior/tf_all.csv',
'prediction': 'output/prediction_scgpt.csv',
'max_n_links': 50000,
'model_file': '../input/resources_test/supplementary/finetuned_scGPT_adamson/best_model.pt',
'model_config_file': '../input/resources_test/supplementary/finetuned_scGPT_adamson/args.json',
'vocab_file': '../input/resources_test/supplementary/finetuned_scGPT_adamson/vocab.json',
'model_file': 'resources_test/supplementary/finetuned_scGPT_adamson/best_model.pt',
'model_config_file': 'resources_test/supplementary/finetuned_scGPT_adamson/args.json',
'vocab_file': 'resources_test/supplementary/finetuned_scGPT_adamson/vocab.json',
'n_bins': 51,
'batch_size': 16,
'condition': 'cell_type'
Expand Down Expand Up @@ -142,6 +142,7 @@ def monitor_memory():
print('Process rna-seq file')
import scanpy as sc
adata = sc.read(par['multiomics_rna'])
adata.X = adata.X.todense()
adata.obs["celltype"] = adata.obs["cell_type"].astype("category")
adata.obs["str_batch"] = adata.obs["donor_id"].astype(str)
data_is_raw = False
Expand Down

0 comments on commit aaabbd9

Please sign in to comment.