diff --git a/src/methods/multi_omics/pycistopic/config.novsh.yaml b/src/methods/multi_omics/pycistopic/config.novsh.yaml deleted file mode 100644 index e0a9e442e..000000000 --- a/src/methods/multi_omics/pycistopic/config.novsh.yaml +++ /dev/null @@ -1,52 +0,0 @@ -__merge__: ../../../api/comp_method.yaml - - -functionality: - name: pycistopic - info: - label: pycistopic - summary: "Cis-regulatory topic modeling using pycisTopic" - description: | - Cis-regulatory topic modeling using pycisTopic. - documentation_url: https://pycistopic.readthedocs.io/en/latest/ - arguments: - - name: --qc - type: boolean_true - description: Whether to perform quality control - - name: --out_dir - type: file - direction: output - create_parent: true - default: 'output/pycistopic' - - name: --region_bin_topics_otsu - type: file - direction: output - default: 'output/pycistopic/candidate_enhancers/region_bin_topics_otsu.pkl' - - name: --region_bin_topics_top3k - type: file - direction: output - default: 'output/pycistopic/candidate_enhancers/region_bin_topics_top3k.pkl' - - name: --markers_dict - type: file - direction: output - default: 'output/pycistopic/candidate_enhancers/markers_dict.pkl' - - name: --cistopic_object - type: file - direction: output - default: 'output/pycistopic/cistopic_object_with_model.pkl' - - resources: - - type: python_script - path: script.py - -platforms: - - type: docker - image: apassemi/scenicplus:latest - setup: - - type: docker - run: | - export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64/ - - type: native - - type: nextflow - directives: - label: [midtime,himem,hicpu] diff --git a/src/methods/multi_omics/pycistopic/script.py b/src/methods/multi_omics/pycistopic/script.py deleted file mode 100644 index e7ab60bd8..000000000 --- a/src/methods/multi_omics/pycistopic/script.py +++ /dev/null @@ -1,555 +0,0 @@ -import os -import subprocess -import gc -import sys -import shutil -import gzip -import tarfile -import pickle -import requests - -import polars -import scrublet as scr -import numpy as np -import pandas as pd -import anndata -import pyranges as pr -from sklearn.manifold import TSNE -import pycisTopic.loom -from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling -from pycisTopic.iterative_peak_calling import get_consensus_peaks -from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, merge -from pycisTopic.qc import get_barcodes_passing_qc_for_sample -from pycisTopic.lda_models import evaluate_models, run_cgs_models, run_cgs_models_mallet -from pycisTopic.topic_binarization import binarize_topics -from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation -from pycisTopic.diff_features import impute_accessibility, normalize_scores, find_highly_variable_features, find_diff_features -from pycisTopic.utils import region_names_to_coordinates -from pycisTopic.gene_activity import get_gene_activity -from pycisTopic.loom import export_region_accessibility_to_loom, export_gene_activity_to_loom -from pycisTopic.clust_vis import find_clusters, run_umap, run_tsne, plot_metadata, plot_topic, cell_topic_heatmap - - -## VIASH START -par = { - 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad', - 'out_dir': 'output/pycistopic' - 'qc': False -} -## VIASH END - -# Bug in pycistopic: Import is missing in pycisTopic.loom, -# so TSNE must be dynamically added to the library's namespace. 
-setattr(pycisTopic.loom, 'TSNE', TSNE) - -os.environ['MALLET_MEMORY'] = '200G' - -sys.path.append(meta['resources_dir']) -os.makedirs(par['out_dir'], exist_ok=True) -atac_dir = os.path.join(par['out_dir'], 'atac') -os.makedirs(atac_dir, exist_ok=True) -out_dir = par['out_dir'] - -# Get list of samples (e.g., donors) -print('Collect list of samples') -adata_atac = anndata.read_h5ad(par['multiomics_atac']) -unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories] -unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories] -del adata_atac - -# Create one individual ATAC-seq file per donor -fragments_dict = {} -for donor_id in unique_donor_ids: - filepath = os.path.join(atac_dir, f'{donor_id}.tsv') - if not os.path.exists(filepath): - print(f'Create tsv file {filepath}') - adata_atac = anndata.read_h5ad(par['multiomics_atac']) - adata_atac.obs.cell_type = [s.replace(' ', '_') for s in adata_atac.obs.cell_type] - adata_atac.obs.donor_id = [s.replace(' ', '_') for s in adata_atac.obs.donor_id] - adata_atac_one_donor = adata_atac[adata_atac.obs.donor_id == donor_id] - coo_matrix = adata_atac_one_donor.X.tocoo() - rows = coo_matrix.row - cols = coo_matrix.col - counts = coo_matrix.data - row_names = adata_atac_one_donor.obs.index[rows] - coord_names = adata_atac_one_donor.var.index[cols] - del adata_atac, adata_atac_one_donor - gc.collect() - d = { - 'chromosome': np.asarray([s.split(':')[0] for s in coord_names]), - 'start': np.asarray([int(s.split(':')[1].split('-')[0]) for s in coord_names], dtype=np.uint32), - 'end': np.asarray([int(s.split(':')[1].split('-')[1]) for s in coord_names], dtype=np.uint32), - 'barcode': [barcode.replace('-', '') for barcode in row_names], - 'count': np.squeeze(np.asarray(counts.astype(np.uint16))) - } - df = pd.DataFrame(d, copy=False) - df.to_csv(filepath, sep='\t', index=False, header=False) - - # Compress tsv file - compressed_filepath = filepath + '.gz' - if not os.path.exists(compressed_filepath): - - print(f'Sort and compress tsv file {filepath}') - - # Sort file by genomic coordinates - sorted_filepath = filepath + '.sorted.tsv' - os.system(f'sort -k1,1 -k2,2n {filepath} > {sorted_filepath}') - - # Compression - subprocess.run(['bgzip', sorted_filepath, '-o', compressed_filepath]) - - fragments_dict[donor_id] = compressed_filepath - - # Index file using tabix - if not os.path.exists(compressed_filepath + '.tbi'): - print(f'Index compressed file {compressed_filepath} using tabix') - subprocess.run(['tabix', compressed_filepath, '-p', 'bed']) - -# Collect cell metadata -print(f'Collect cell metadata') -adata_atac = anndata.read_h5ad(par['multiomics_atac']) -donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id] -index = [barcode.replace('-', '') + '-' + donor_id for donor_id, barcode in zip(donor_ids, adata_atac.obs.index)] -cell_data = pd.DataFrame({ - 'cell_type': [s.replace(' ', '_') for s in adata_atac.obs.cell_type.to_numpy()], - 'donor_id': [s.replace(' ', '_') for s in adata_atac.obs.donor_id.to_numpy()] -}, index=index, copy=False) - -# Get chromosome sizes -print(f'Download chromosome sizes from UCSC') -chromsizes = pd.read_table( - 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes', - header=None, - names=['Chromosome', 'End'] -) -chromsizes.insert(1, 'Start', 0) - -print(f'Start pseudobulking') -os.makedirs(os.path.join(out_dir, 'consensus_peak_calling'), exist_ok=True) -os.makedirs(os.path.join(out_dir, 
'consensus_peak_calling/pseudobulk_bed_files'), exist_ok=True) -os.makedirs(os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bw_files'), exist_ok=True) -os.makedirs(os.path.join(out_dir, 'consensus_peak_calling/MACS'), exist_ok=True) -os.makedirs(os.path.join(out_dir, 'consensus_peak_calling/tmp'), exist_ok=True) - -# Check if individual fragment files are available for each cell type -valid = True -bed_paths = {} -for cell_type in unique_cell_types: - filepath = os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bed_files', f'{cell_type}.fragments.tsv.gz') - bed_paths[cell_type] = filepath - if not os.path.exists(filepath): - valid = False - -# Perform pseudobulking for each cell type -if not valid: - print('Pseudobulk each cell type') - _, bed_paths = export_pseudobulk( - input_data=cell_data, - variable='cell_type', - sample_id_col='donor_id', - chromsizes=chromsizes, - bed_path=os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bed_files'), - bigwig_path=os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bw_files'), - path_to_fragments=fragments_dict, - n_cpu=10, - temp_dir=os.path.join(out_dir, 'consensus_peak_calling/tmp'), - split_pattern='-', - ) - with open(os.path.join(out_dir, 'consensus_peak_calling/bed_paths.tsv'), 'wt') as f: - for v in bed_paths: - _ = f.write(f'{v}\t{bed_paths[v]}\n') - -# Load paths to pseudobulked samples -bed_paths = {} -with open(os.path.join(out_dir, 'consensus_peak_calling/bed_paths.tsv')) as f: - for line in f: - v, p = line.strip().split("\t") - bed_paths.update({v: p}) - -# Call peaks using MACS2 -narrow_peak_dict = peak_calling( - macs_path = 'macs2', - bed_paths=bed_paths, - outdir=os.path.join(os.path.join(out_dir, 'consensus_peak_calling/MACS')), - genome_size='hs', - n_cpu=10, - input_format='BEDPE', - shift=73, - ext_size=146, - keep_dup='all', - q_value=0.05 -) - -# Download list of blacklist regions -if not os.path.exists(os.path.join(out_dir, 'hg38-blacklist.v2.bed')): - print('Download list of blacklist regions') - url = 'https://raw.githubusercontent.com/aertslab/pycisTopic/d6a2f8c832c14faae07def1d3ad8755531f50ad5/blacklist/hg38-blacklist.v2.bed' - response = requests.get(url) - with open(os.path.join(out_dir, 'hg38-blacklist.v2.bed'), 'w') as f: - f.write(response.text) - -# Consensus peak calling -consensus_peaks = get_consensus_peaks( - narrow_peaks_dict=narrow_peak_dict, - peak_half_width=250, - chromsizes=chromsizes, - path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed') -) -consensus_peaks.to_bed( - path=os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'), - keep=True, - compression='infer', - chain=False -) - -# Download TSS annotations -os.makedirs(os.path.join(out_dir, 'qc'), exist_ok=True) -if not os.path.exists(os.path.join(out_dir, 'qc', 'tss.bed')): - subprocess.run([ - 'pycistopic', 'tss', 'get_tss', - '--output', os.path.join(out_dir, 'qc', 'tss.bed'), - '--name', 'hsapiens_gene_ensembl', - '--to-chrom-source', 'ucsc', - '--ucsc', 'hg38' - ]) - -# Create cistopic objects -if not os.path.exists(os.path.join(out_dir, 'cistopic_obj.pkl')): - - if par['qc']: # Whether to perform quality control - - # Compute QC metrics - print('Perform QC') - for donor_id in unique_donor_ids: - filepath = os.path.join(atac_dir, f'{donor_id}.tsv.gz') - subprocess.run([ - 'pycistopic', 'qc', - '--fragments', os.path.join(out_dir, 'qc', 'tss.bed'), - '--regions', os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'), - '--tss', os.path.join(out_dir, 'qc', 'tss.bed'), - 
'--output', os.path.join(out_dir, 'qc', donor_id), - '--tss_flank_window', '10000', # Default: 2000 - '--tss_smoothing_rolling_window', '60', # Default: 10 - '--tss_minimum_signal_window', '5', # Default: 100 - '--tss_window', '25', # Default: 50 - '--tss_min_norm', '0.5', # Default: 0.2 - '--min_fragments_per_cb', '30', # Default: 10 - '--threads', '10' - ]) - - # Filter out low quality cells - print('Filter out low quality cells') - sample_id_to_barcodes_passing_filters = {} - sample_id_to_thresholds = {} - for donor_id in fragments_dict.keys(): - ( - sample_id_to_barcodes_passing_filters[donor_id], - sample_id_to_thresholds[donor_id] - ) = get_barcodes_passing_qc_for_sample( - sample_id=donor_id, - pycistopic_qc_output_dir=os.path.join(out_dir, 'qc'), - unique_fragments_threshold=None, # use automatic thresholding - tss_enrichment_threshold=None, # use automatic thresholding - frip_threshold=0, - use_automatic_thresholds=True, - ) - - # Create cistopic objects - print('Create cisTopic objects') - cistopic_obj_list = [] - for donor_id in unique_donor_ids: - sample_metrics = pl.read_parquet( - os.path.join(out_dir, 'qc', donor_id, f'{donor_id}.fragments_stats_per_cb.parquet') - ).to_pandas().set_index('CB').loc[sample_id_to_barcodes_passing_filters[sample_id]] - cistopic_obj = create_cistopic_object_from_fragments( - path_to_fragments=fragments_dict[donor_id], - path_to_regions=os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'), - path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed'), - metrics=sample_metrics, - valid_bc=sample_id_to_barcodes_passing_filters[sample_id], - n_cpu=10, - project=donor_id, - split_pattern='-' - ) - cistopic_obj_list.append(cistopic_obj) - else: - - # Skip QC and create cisTopic objects - print('Create cisTopic objects without QC') - cistopic_obj_list = [] - for donor_id in unique_donor_ids: - cistopic_obj = create_cistopic_object_from_fragments( - path_to_fragments=fragments_dict[donor_id], - path_to_regions=os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'), - path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed'), - n_cpu=10, - project=donor_id, - split_pattern='-' - ) - cistopic_obj_list.append(cistopic_obj) - - # Add metadata to cistopic objects - for i in range(len(cistopic_obj_list)): - cistopic_obj_list[i].add_cell_data(cell_data, split_pattern='-') - - # Infer doublets using scrublet - print('Infer doublets using scrublet') - for i in range(len(cistopic_obj_list)): - scrub = scr.Scrublet(cistopic_obj_list[i].fragment_matrix.T, expected_doublet_rate=0.1) - doublet_scores, predicted_doublets = scrub.scrub_doublets() - scrub.plot_histogram(); - scrub.call_doublets(threshold=0.22) - scrub.plot_histogram(); - scrublet = pd.DataFrame([scrub.doublet_scores_obs_, scrub.predicted_doublets_], columns=cistopic_obj.cell_names, index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T - cistopic_obj_list[i].add_cell_data(scrublet, split_pattern='-') - - # Combine samples into a single cistopic object - if len(cistopic_obj_list) == 1: - cistopic_obj = cistopic_obj_list[0] - else: - cistopic_obj = merge(cistopic_obj_list, is_acc=1, copy=False, split_pattern='-') - - # Save cistopic objects - with open(os.path.join(out_dir, 'cistopic_obj.pkl'), 'wb') as f: - pickle.dump(cistopic_obj, f) -else: - # Load cistopic objects - with open(os.path.join(out_dir, 'cistopic_obj.pkl'), 'rb') as f: - cistopic_obj = pickle.load(f) - -# Download Mallet -MALLET_PATH = os.path.join(out_dir, 'Mallet-202108', 'bin', 
'mallet') -if not os.path.exists(MALLET_PATH): - url = 'https://github.com/mimno/Mallet/releases/download/v202108/Mallet-202108-bin.tar.gz' - response = requests.get(url) - with open(os.path.join(out_dir, 'Mallet-202108-bin.tar.gz'), 'wb') as f: - f.write(response.content) - with tarfile.open(os.path.join(out_dir, 'Mallet-202108-bin.tar.gz'), 'r:gz') as f: - f.extractall(path=out_dir) - -# LDA-based topic modeling -print('Run LDA models') -filepath = os.path.join(out_dir, 'cistopic_object_with_model.pkl') -if not os.path.exists(filepath): - - # Topic modeling - n_topics = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50] - if os.path.exists(MALLET_PATH): - models = run_cgs_models_mallet( - cistopic_obj, - n_topics=n_topics, - n_cpu=12, - n_iter=500, - random_state=555, - alpha=50, - alpha_by_topic=True, - eta=0.1, - eta_by_topic=False, - mallet_path=MALLET_PATH - ) - else: - print('Could not find Mallet. Running the sequential version of LDA instead.') - models = run_cgs_models( - cistopic_obj, - n_topics=n_topics, - n_cpu=12, - n_iter=500, - random_state=555, - alpha=50, - alpha_by_topic=True, - eta=0.1, - eta_by_topic=False - ) - - # Model selection - model = evaluate_models(models, select_model=40, return_model=True) - cistopic_obj.add_LDA_model(model) - - with open(filepath, 'wb') as f: - pickle.dump(cistopic_obj, f) -else: - with open(filepath, 'rb') as f: - cistopic_obj = pickle.load(f) - -chromsizes = pd.read_table(os.path.join(out_dir, "qc", "hg38.chrom_sizes_and_alias.tsv")) -chromsizes.rename({"# ucsc": "Chromosome", "length": "End"}, axis = 1, inplace = True) -chromsizes['Start'] = 0 -chromsizes = pr.PyRanges(chromsizes[['Chromosome', 'Start', 'End']]) - -# Find clusters -find_clusters( - cistopic_obj, - target='cell', - k=10, - res=[0.6, 1.2, 3], - scale=True, - split_pattern='-' -) - -# 2D projections -run_umap(cistopic_obj, target='cell', scale=True) -run_tsne(cistopic_obj, target='cell', scale=True) - -# Topic binarization -region_bin_topics_top_3k = binarize_topics(cistopic_obj, method='ntop', ntop=3_000) -region_bin_topics_otsu = binarize_topics(cistopic_obj, method='otsu') -binarized_cell_topic = binarize_topics(cistopic_obj, target='cell', method='li') - -# Topic annotation -topic_annot = topic_annotation( - cistopic_obj, - annot_var='cell_type', - binarized_cell_topic=binarized_cell_topic, - general_topic_thr=0.2 -) - -# Identify differentially accessible regions -imputed_acc_obj = impute_accessibility( - cistopic_obj, - selected_cells=None, - selected_regions=None, - scale_factor=10**6 -) -normalized_imputed_acc_obj = normalize_scores(imputed_acc_obj, scale_factor=10**4) -variable_regions = find_highly_variable_features( - normalized_imputed_acc_obj, - min_disp=0.05, - min_mean=0.0125, - max_mean=3, - max_disp=np.inf, - n_bins=20, - n_top_features=None, - plot=False -) -markers_dict = find_diff_features( - cistopic_obj, - imputed_acc_obj, - variable='cell_type', - var_features=variable_regions, - contrasts=None, - adjpval_thr=0.05, - log2fc_thr=np.log2(1.5), - n_cpu=5, - split_pattern='-' -) - -# Save topics -folder = os.path.join(out_dir, 'region_sets', 'Topics_otsu') -os.makedirs(folder, exist_ok=True) -for topic in binarized_cell_topic: - region_names_to_coordinates( - region_bin_topics_otsu[topic].index - ).sort_values( - ['Chromosome', 'Start', 'End'] - ).to_csv( - os.path.join(folder, f'{topic}.bed'), - sep='\t', - header=False, index=False - ) -folder = os.path.join(out_dir, 'region_sets', 'Topics_top_3k') -os.makedirs(folder, exist_ok=True) -for topic in 
region_bin_topics_top_3k: - region_names_to_coordinates( - region_bin_topics_top_3k[topic].index - ).sort_values( - ['Chromosome', 'Start', 'End'] - ).to_csv( - os.path.join(folder, f'{topic}.bed'), - sep='\t', - header=False, index=False - ) -folder = os.path.join(out_dir, 'candidate_enhancers') -os.makedirs(folder, exist_ok=True) -with open(os.path.join(folder, 'region_bin_topics_otsu.pkl'), 'wb') as f: - pickle.dump(region_bin_topics_otsu, f) -with open(os.path.join(folder, 'region_bin_topics_top3k.pkl'), 'wb') as f: - pickle.dump(region_bin_topics_top_3k, f) - -# Save DARs -folder = os.path.join(out_dir, 'region_sets', 'DARs_cell_type') -os.makedirs(folder, exist_ok=True) -for cell_type in markers_dict: - region_names_to_coordinates( - markers_dict[cell_type].index - ).sort_values( - ['Chromosome', 'Start', 'End'] - ).to_csv( - os.path.join(folder, f'{cell_type}.bed'), - sep='\t', - header=False, - index=False - ) -folder = os.path.join(out_dir, 'candidate_enhancers') -with open(os.path.join(folder, 'markers_dict.pkl'), 'wb') as f: - pickle.dump(markers_dict, f) - -# Get gene activity -pr_annotation = pd.read_table( - os.path.join(out_dir, 'qc', 'tss.bed') -).rename( - {'Name': 'Gene', '# Chromosome': 'Chromosome'}, axis=1) -pr_annotation['Transcription_Start_Site'] = pr_annotation['Start'] -pr_annotation = pr.PyRanges(pr_annotation) -gene_act, weights = get_gene_activity( - imputed_acc_obj, - pr_annotation, - chromsizes, - use_gene_boundaries=True, - upstream=[1000, 100000], - downstream=[1000, 100000], - distance_weight=True, - decay_rate=1, - extend_gene_body_upstream=10000, - extend_gene_body_downstream=500, - gene_size_weight=False, - gene_size_scale_factor='median', - remove_promoters=False, - average_scores=True, - scale_factor=1, - extend_tss=[10, 10], - gini_weight = True, - return_weights= True -) - -# Infer differentially accessible genes -DAG_markers_dict= find_diff_features( - cistopic_obj, - gene_act, - variable='cell_type', - var_features=None, - contrasts=None, - adjpval_thr=0.05, - log2fc_thr=np.log2(1.5), - n_cpu=5, - split_pattern='-' -) - -# Exporting to loom -os.makedirs(os.path.join(out_dir, 'loom'), exist_ok=True) -cluster_markers = {'cell_type': markers_dict} -filepath = os.path.join(out_dir, 'loom', f'region-accessibility.loom') -print(f'Saving region accessibility to {filepath}') -export_region_accessibility_to_loom( - accessibility_matrix=imputed_acc_obj, - cistopic_obj=cistopic_obj, - binarized_topic_region=region_bin_topics_otsu, - binarized_cell_topic=binarized_cell_topic, - out_fname=filepath, - cluster_annotation=['cell_type'], - cluster_markers=cluster_markers, - nomenclature='hg38', - split_pattern='-' -) -filepath = os.path.join(out_dir, 'loom', f'gene-activity.loom') -print(f'Saving gene activity to {filepath}') -export_gene_activity_to_loom( - gene_activity_matrix=gene_act, - cistopic_obj=cistopic_obj, - out_fname=filepath, - cluster_annotation=['cell_type'], - cluster_markers=cluster_markers, - nomenclature='hg38', - split_pattern='-' -) - -print('Finished.') diff --git a/src/methods/multi_omics/pycistopic/test.sh b/src/methods/multi_omics/pycistopic/test.sh deleted file mode 100644 index fccfacdce..000000000 --- a/src/methods/multi_omics/pycistopic/test.sh +++ /dev/null @@ -1 +0,0 @@ -viash build src/methods/pycistopic/config.vsh.yaml --platform docker -o bin/pycistopic && bin/pycistopic/pycistopic --multiomics_atac resources/resources_test/grn-benchmark/multiomics_atac.h5ad --out_dir output/pycistopic \ No newline at end of file diff --git 
a/src/methods/multi_omics/scenicplus/config.vsh.yaml b/src/methods/multi_omics/scenicplus/config.vsh.yaml
index bbb78b760..2c8af4067 100644
--- a/src/methods/multi_omics/scenicplus/config.vsh.yaml
+++ b/src/methods/multi_omics/scenicplus/config.vsh.yaml
@@ -8,12 +8,18 @@ functionality:
     label: scenicplus
     summary: "GRN inference using scenicplus"
     description: |
-      GRN inference using scenicplus
+      GRN inference using scenicplus.
     documentation_url: https://scenicplus.readthedocs.io/en/latest/human_cerebellum.html
   arguments:
-    - name: --cistopic_object
+    - name: --pycistopic_object
       type: file
+      required: true
       direction: output
+      description: "Path to the serialized pycisTopic object (.pkl)."
+    - name: --qc
+      type: boolean
+      default: false
+      description: "Whether to perform quality control."
   resources:
     - type: python_script
       path: script.py
diff --git a/src/methods/multi_omics/scenicplus/script.py b/src/methods/multi_omics/scenicplus/script.py
index 16805f070..ebb36b08d 100644
--- a/src/methods/multi_omics/scenicplus/script.py
+++ b/src/methods/multi_omics/scenicplus/script.py
@@ -2,10 +2,16 @@
 import sys
 import yaml
 import pickle
+import tempfile
 import contextlib
 import hashlib
+import shutil
 import requests
+import traceback
 import subprocess
+import gc
+import gzip
+import tarfile
 from urllib.request import urlretrieve
 
 import numpy as np
@@ -15,10 +21,556 @@
 import pyranges as pr
 from pycistarget.utils import region_names_to_coordinates
 from scenicplus.wrappers.run_pycistarget import run_pycistarget
 
+import polars as pl
+import scrublet as scr
+from sklearn.manifold import TSNE
+import pycisTopic.loom
+from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
+from pycisTopic.iterative_peak_calling import get_consensus_peaks
+from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, merge
+from pycisTopic.qc import get_barcodes_passing_qc_for_sample
+from pycisTopic.lda_models import evaluate_models, run_cgs_models, run_cgs_models_mallet
+from pycisTopic.topic_binarization import binarize_topics
+from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
+from pycisTopic.diff_features import impute_accessibility, normalize_scores, find_highly_variable_features, find_diff_features
+from pycisTopic.utils import region_names_to_coordinates
+from pycisTopic.gene_activity import get_gene_activity
+from pycisTopic.loom import export_region_accessibility_to_loom, export_gene_activity_to_loom
+from pycisTopic.clust_vis import find_clusters, run_umap, run_tsne, plot_metadata, plot_topic, cell_topic_heatmap
 
 ## VIASH START
 par = {
+    'multiomics_rna': '/base/resources/resources_test/grn-benchmark/multiomics_rna.h5ad',
+    'multiomics_atac': '/base/resources/resources_test/grn-benchmark/multiomics_atac.h5ad',
+    'temp_dir': '/base/output/scenicplus',
+    'pycistopic_object': '/base/output/scenicplus/pycistopic.pkl',
+    'prediction': '/base/output/prediction.csv',
+    'qc': False,
+    'num_workers': 4
+}
+## VIASH END
+
+
+# Bug in pycisTopic: an import is missing in pycisTopic.loom,
+# so TSNE must be dynamically added to the library's namespace.
+setattr(pycisTopic.loom, 'TSNE', TSNE) + +os.environ['MALLET_MEMORY'] = '200G' +try: + sys.path.append(meta['resources_dir']) +except NameError: + pass + +out_dir = par['temp_dir'] +atac_dir = os.path.join(out_dir, 'atac') +os.makedirs(atac_dir, exist_ok=True) + +# Get list of samples (e.g., donors) +print('Collect list of samples') +adata_atac = anndata.read_h5ad(par['multiomics_atac']) +unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories] +unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories] +del adata_atac + +# Create one individual ATAC-seq file per donor +fragments_dict = {} +for donor_id in unique_donor_ids: + filepath = os.path.join(atac_dir, f'{donor_id}.tsv') + if not os.path.exists(filepath): + print(f'Create tsv file {filepath}') + adata_atac = anndata.read_h5ad(par['multiomics_atac']) + adata_atac.obs.cell_type = [s.replace(' ', '_') for s in adata_atac.obs.cell_type] + adata_atac.obs.donor_id = [s.replace(' ', '_') for s in adata_atac.obs.donor_id] + adata_atac_one_donor = adata_atac[adata_atac.obs.donor_id == donor_id] + coo_matrix = adata_atac_one_donor.X.tocoo() + rows = coo_matrix.row + cols = coo_matrix.col + counts = coo_matrix.data + row_names = adata_atac_one_donor.obs.index[rows] + coord_names = adata_atac_one_donor.var.index[cols] + del adata_atac, adata_atac_one_donor + gc.collect() + d = { + 'chromosome': np.asarray([s.split(':')[0] for s in coord_names]), + 'start': np.asarray([int(s.split(':')[1].split('-')[0]) for s in coord_names], dtype=np.uint32), + 'end': np.asarray([int(s.split(':')[1].split('-')[1]) for s in coord_names], dtype=np.uint32), + 'barcode': [barcode.replace('-', '') for barcode in row_names], + 'count': np.squeeze(np.asarray(counts.astype(np.uint16))) + } + df = pd.DataFrame(d, copy=False) + df.to_csv(filepath, sep='\t', index=False, header=False) + + # Compress tsv file + compressed_filepath = filepath + '.gz' + if not os.path.exists(compressed_filepath): + + print(f'Sort and compress tsv file {filepath}') + + # Sort file by genomic coordinates + sorted_filepath = filepath + '.sorted.tsv' + os.system(f'sort -k1,1 -k2,2n {filepath} > {sorted_filepath}') + + # Compression + subprocess.run(['bgzip', sorted_filepath, '-o', compressed_filepath]) + + fragments_dict[donor_id] = compressed_filepath + + # Index file using tabix + if not os.path.exists(compressed_filepath + '.tbi'): + print(f'Index compressed file {compressed_filepath} using tabix') + subprocess.run(['tabix', compressed_filepath, '-p', 'bed']) + +# Collect cell metadata +print(f'Collect cell metadata') +adata_atac = anndata.read_h5ad(par['multiomics_atac']) +donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id] +index = [barcode.replace('-', '') + '-' + donor_id for donor_id, barcode in zip(donor_ids, adata_atac.obs.index)] +cell_data = pd.DataFrame({ + 'cell_type': [s.replace(' ', '_') for s in adata_atac.obs.cell_type.to_numpy()], + 'donor_id': [s.replace(' ', '_') for s in adata_atac.obs.donor_id.to_numpy()] +}, index=index, copy=False) + +# Get chromosome sizes +print(f'Download chromosome sizes from UCSC') +chromsizes = pd.read_table( + 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes', + header=None, + names=['Chromosome', 'End'] +) +chromsizes.insert(1, 'Start', 0) + +print(f'Start pseudobulking') +os.makedirs(os.path.join(out_dir, 'consensus_peak_calling'), exist_ok=True) +os.makedirs(os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bed_files'), exist_ok=True) 
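Aside: the fragment-file preparation above shells out to `sort`, `bgzip`, and `tabix`. A minimal single-process alternative, assuming the `pysam` package is available in the image (it is not used by the original script), could look like this:

```python
# Sketch: bgzip-compress and tabix-index an already coordinate-sorted fragments TSV
# with pysam instead of shelling out to bgzip/tabix. Assumes pysam is installed.
import pysam

def compress_and_index(sorted_tsv: str) -> str:
    gz_path = sorted_tsv + '.gz'
    pysam.tabix_compress(sorted_tsv, gz_path, force=True)  # bgzip-compatible compression
    pysam.tabix_index(gz_path, preset='bed', force=True)   # writes gz_path + '.tbi'
    return gz_path
```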
+os.makedirs(os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bw_files'), exist_ok=True)
+os.makedirs(os.path.join(out_dir, 'consensus_peak_calling/MACS'), exist_ok=True)
+os.makedirs(os.path.join(out_dir, 'consensus_peak_calling/tmp'), exist_ok=True)
+
+# Check if individual fragment files are available for each cell type
+valid = True
+bed_paths = {}
+for cell_type in unique_cell_types:
+    filepath = os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bed_files', f'{cell_type}.fragments.tsv.gz')
+    bed_paths[cell_type] = filepath
+    if not os.path.exists(filepath):
+        valid = False
+
+# Perform pseudobulking for each cell type
+if not valid:
+    print('Pseudobulk each cell type')
+    _, bed_paths = export_pseudobulk(
+        input_data=cell_data,
+        variable='cell_type',
+        sample_id_col='donor_id',
+        chromsizes=chromsizes,
+        bed_path=os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bed_files'),
+        bigwig_path=os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bw_files'),
+        path_to_fragments=fragments_dict,
+        n_cpu=10,
+        temp_dir=os.path.join(out_dir, 'consensus_peak_calling/tmp'),
+        split_pattern='-',
+    )
+    with open(os.path.join(out_dir, 'consensus_peak_calling/bed_paths.tsv'), 'wt') as f:
+        for v in bed_paths:
+            _ = f.write(f'{v}\t{bed_paths[v]}\n')
+
+# Load paths to pseudobulked samples
+bed_paths = {}
+with open(os.path.join(out_dir, 'consensus_peak_calling/bed_paths.tsv')) as f:
+    for line in f:
+        v, p = line.strip().split("\t")
+        bed_paths.update({v: p})
+
+# Call peaks using MACS2
+narrow_peak_dict = peak_calling(
+    macs_path='macs2',
+    bed_paths=bed_paths,
+    outdir=os.path.join(out_dir, 'consensus_peak_calling/MACS'),
+    genome_size='hs',
+    n_cpu=10,
+    input_format='BEDPE',
+    shift=73,
+    ext_size=146,
+    keep_dup='all',
+    q_value=0.05
+)
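For reference, the `peak_calling` wrapper above drives MACS2 once per pseudobulk. A roughly equivalent direct invocation for a single cell type is sketched below; the treatment path and sample name are illustrative placeholders, and note that MACS2 may ignore the shift/extension settings for BEDPE input:

```python
# Approximate MACS2 call implied by the peak_calling(...) parameters above,
# for one pseudobulk fragment file. 'B_cells' and the paths are placeholders.
import subprocess

subprocess.run([
    'macs2', 'callpeak',
    '--treatment', 'consensus_peak_calling/pseudobulk_bed_files/B_cells.fragments.tsv.gz',
    '--name', 'B_cells',
    '--outdir', 'consensus_peak_calling/MACS',
    '--format', 'BEDPE',
    '--gsize', 'hs',         # genome_size='hs'
    '--qvalue', '0.05',      # q_value=0.05
    '--nomodel',
    '--shift', '73',         # shift=73 (Tn5 cut-site centering)
    '--extsize', '146',      # ext_size=146
    '--keep-dup', 'all',
], check=True)
```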
+# Download list of blacklist regions
+if not os.path.exists(os.path.join(out_dir, 'hg38-blacklist.v2.bed')):
+    print('Download list of blacklist regions')
+    url = 'https://raw.githubusercontent.com/aertslab/pycisTopic/d6a2f8c832c14faae07def1d3ad8755531f50ad5/blacklist/hg38-blacklist.v2.bed'
+    response = requests.get(url)
+    with open(os.path.join(out_dir, 'hg38-blacklist.v2.bed'), 'w') as f:
+        f.write(response.text)
+
+# Consensus peak calling
+consensus_peaks = get_consensus_peaks(
+    narrow_peaks_dict=narrow_peak_dict,
+    peak_half_width=250,
+    chromsizes=chromsizes,
+    path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed')
+)
+consensus_peaks.to_bed(
+    path=os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'),
+    keep=True,
+    compression='infer',
+    chain=False
+)
+
+# Download TSS annotations
+os.makedirs(os.path.join(out_dir, 'qc'), exist_ok=True)
+if not os.path.exists(os.path.join(out_dir, 'qc', 'tss.bed')):
+    subprocess.run([
+        'pycistopic', 'tss', 'get_tss',
+        '--output', os.path.join(out_dir, 'qc', 'tss.bed'),
+        '--name', 'hsapiens_gene_ensembl',
+        '--to-chrom-source', 'ucsc',
+        '--ucsc', 'hg38'
+    ])
+
+# Create cistopic objects
+if not os.path.exists(os.path.join(out_dir, 'cistopic_obj.pkl')):
+
+    if par['qc']:  # Whether to perform quality control
+
+        # Compute QC metrics
+        print('Perform QC')
+        for donor_id in unique_donor_ids:
+            filepath = os.path.join(atac_dir, f'{donor_id}.tsv.gz')
+            subprocess.run([
+                'pycistopic', 'qc',
+                '--fragments', filepath,
+                '--regions', os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'),
+                '--tss', os.path.join(out_dir, 'qc', 'tss.bed'),
+                '--output', os.path.join(out_dir, 'qc', donor_id),
+                '--tss_flank_window', '10000',           # Default: 2000
+                '--tss_smoothing_rolling_window', '60',  # Default: 10
+                '--tss_minimum_signal_window', '5',      # Default: 100
+                '--tss_window', '25',                    # Default: 50
+                '--tss_min_norm', '0.5',                 # Default: 0.2
+                '--min_fragments_per_cb', '30',          # Default: 10
+                '--threads', '10'
+            ])
+
+        # Filter out low quality cells
+        print('Filter out low quality cells')
+        sample_id_to_barcodes_passing_filters = {}
+        sample_id_to_thresholds = {}
+        for donor_id in fragments_dict.keys():
+            (
+                sample_id_to_barcodes_passing_filters[donor_id],
+                sample_id_to_thresholds[donor_id]
+            ) = get_barcodes_passing_qc_for_sample(
+                sample_id=donor_id,
+                pycistopic_qc_output_dir=os.path.join(out_dir, 'qc'),
+                unique_fragments_threshold=None,  # use automatic thresholding
+                tss_enrichment_threshold=None,    # use automatic thresholding
+                frip_threshold=0,
+                use_automatic_thresholds=True,
+            )
+
+        # Create cistopic objects
+        print('Create cisTopic objects')
+        cistopic_obj_list = []
+        for donor_id in unique_donor_ids:
+            sample_metrics = pl.read_parquet(
+                os.path.join(out_dir, 'qc', donor_id, f'{donor_id}.fragments_stats_per_cb.parquet')
+            ).to_pandas().set_index('CB').loc[sample_id_to_barcodes_passing_filters[donor_id]]
+            cistopic_obj = create_cistopic_object_from_fragments(
+                path_to_fragments=fragments_dict[donor_id],
+                path_to_regions=os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'),
+                path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed'),
+                metrics=sample_metrics,
+                valid_bc=sample_id_to_barcodes_passing_filters[donor_id],
+                n_cpu=10,
+                project=donor_id,
+                split_pattern='-'
+            )
+            cistopic_obj_list.append(cistopic_obj)
+    else:
+
+        # Skip QC and create cisTopic objects
+        print('Create cisTopic objects without QC')
+        cistopic_obj_list = []
+        for donor_id in unique_donor_ids:
+            cistopic_obj = create_cistopic_object_from_fragments(
+                path_to_fragments=fragments_dict[donor_id],
+                path_to_regions=os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'),
+                path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed'),
+                n_cpu=10,
+                project=donor_id,
+                split_pattern='-'
+            )
+            cistopic_obj_list.append(cistopic_obj)
+
+    # Add metadata to cistopic objects
+    for i in range(len(cistopic_obj_list)):
+        cistopic_obj_list[i].add_cell_data(cell_data, split_pattern='-')
+
+    # Infer doublets using scrublet
+    print('Infer doublets using scrublet')
+    for i in range(len(cistopic_obj_list)):
+        scrub = scr.Scrublet(cistopic_obj_list[i].fragment_matrix.T, expected_doublet_rate=0.1)
+        doublet_scores, predicted_doublets = scrub.scrub_doublets()
+        scrub.plot_histogram()
+        scrub.call_doublets(threshold=0.22)
+        scrub.plot_histogram()
+        scrublet = pd.DataFrame(
+            [scrub.doublet_scores_obs_, scrub.predicted_doublets_],
+            columns=cistopic_obj_list[i].cell_names,
+            index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']
+        ).T
+        cistopic_obj_list[i].add_cell_data(scrublet, split_pattern='-')
+
+    # Combine samples into a single cistopic object
+    if len(cistopic_obj_list) == 1:
+        cistopic_obj = cistopic_obj_list[0]
+    else:
+        cistopic_obj = merge(cistopic_obj_list, is_acc=1, copy=False, split_pattern='-')
+
+    # Save cistopic objects
+    with open(os.path.join(out_dir, 'cistopic_obj.pkl'), 'wb') as f:
+        pickle.dump(cistopic_obj, f)
+else:
+    # Load cistopic objects
+    with open(os.path.join(out_dir, 'cistopic_obj.pkl'), 'rb') as f:
+        cistopic_obj = pickle.load(f)
+# Download Mallet
+MALLET_PATH = os.path.join(out_dir, 'Mallet-202108', 'bin', 'mallet')
+if not os.path.exists(MALLET_PATH):
+    url = 'https://github.com/mimno/Mallet/releases/download/v202108/Mallet-202108-bin.tar.gz'
+    response = requests.get(url)
+    with open(os.path.join(out_dir, 'Mallet-202108-bin.tar.gz'), 'wb') as f:
+        f.write(response.content)
+    with tarfile.open(os.path.join(out_dir, 'Mallet-202108-bin.tar.gz'), 'r:gz') as f:
+        f.extractall(path=out_dir)
+
+# LDA-based topic modeling
+print('Run LDA models')
+if not os.path.exists(par['pycistopic_object']):
+
+    # Topic modeling
+    n_topics = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
+    if os.path.exists(MALLET_PATH):
+        models = run_cgs_models_mallet(
+            cistopic_obj,
+            n_topics=n_topics,
+            n_cpu=par['num_workers'],
+            n_iter=500,
+            random_state=555,
+            alpha=50,
+            alpha_by_topic=True,
+            eta=0.1,
+            eta_by_topic=False,
+            mallet_path=MALLET_PATH
+        )
+    else:
+        print('Could not find Mallet. Running the sequential version of LDA instead.')
+        models = run_cgs_models(
+            cistopic_obj,
+            n_topics=n_topics,
+            n_cpu=par['num_workers'],
+            n_iter=500,
+            random_state=555,
+            alpha=50,
+            alpha_by_topic=True,
+            eta=0.1,
+            eta_by_topic=False
+        )
+
+    # Model selection
+    model = evaluate_models(models, select_model=40, return_model=True)
+    cistopic_obj.add_LDA_model(model)
+
+    with open(par['pycistopic_object'], 'wb') as f:
+        pickle.dump(cistopic_obj, f)
+else:
+    with open(par['pycistopic_object'], 'rb') as f:
+        cistopic_obj = pickle.load(f)
+
+chromsizes = pd.read_table(os.path.join(out_dir, "qc", "hg38.chrom_sizes_and_alias.tsv"))
+chromsizes.rename({"# ucsc": "Chromosome", "length": "End"}, axis=1, inplace=True)
+chromsizes['Start'] = 0
+chromsizes = pr.PyRanges(chromsizes[['Chromosome', 'Start', 'End']])
+
+# Find clusters
+find_clusters(
+    cistopic_obj,
+    target='cell',
+    k=10,
+    res=[0.6, 1.2, 3],
+    scale=True,
+    split_pattern='-'
+)
+
+# 2D projections
+run_umap(cistopic_obj, target='cell', scale=True)
+run_tsne(cistopic_obj, target='cell', scale=True)
+
+# Topic binarization
+region_bin_topics_top_3k = binarize_topics(cistopic_obj, method='ntop', ntop=3_000)
+region_bin_topics_otsu = binarize_topics(cistopic_obj, method='otsu')
+binarized_cell_topic = binarize_topics(cistopic_obj, target='cell', method='li')
+
+# Topic annotation
+topic_annot = topic_annotation(
+    cistopic_obj,
+    annot_var='cell_type',
+    binarized_cell_topic=binarized_cell_topic,
+    general_topic_thr=0.2
+)
+
+# Identify differentially accessible regions
+imputed_acc_obj = impute_accessibility(
+    cistopic_obj,
+    selected_cells=None,
+    selected_regions=None,
+    scale_factor=10**6
+)
+normalized_imputed_acc_obj = normalize_scores(imputed_acc_obj, scale_factor=10**4)
+variable_regions = find_highly_variable_features(
+    normalized_imputed_acc_obj,
+    min_disp=0.05,
+    min_mean=0.0125,
+    max_mean=3,
+    max_disp=np.inf,
+    n_bins=20,
+    n_top_features=None,
+    plot=False
+)
+markers_dict = find_diff_features(
+    cistopic_obj,
+    imputed_acc_obj,
+    variable='cell_type',
+    var_features=variable_regions,
+    contrasts=None,
+    adjpval_thr=0.05,
+    log2fc_thr=np.log2(1.5),
+    n_cpu=5,
+    split_pattern='-'
+)
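The export loops below lean on `region_names_to_coordinates`; conceptually it just splits `chr:start-end` region names into a BED-like data frame. An illustrative sketch (not the pycisTopic implementation) of that conversion:

```python
# Sketch of what a region_names_to_coordinates-style helper produces, assuming
# region names follow the 'chr:start-end' convention used throughout this script.
import pandas as pd

def regions_to_frame(region_names):
    records = []
    for name in region_names:
        chrom, span = name.split(':')
        start, end = span.split('-')
        records.append((chrom, int(start), int(end)))
    return pd.DataFrame(records, columns=['Chromosome', 'Start', 'End'])

print(regions_to_frame(['chr1:100-500', 'chr2:1000-1500']))
```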
+# Save topics
+folder = os.path.join(out_dir, 'region_sets', 'Topics_otsu')
+os.makedirs(folder, exist_ok=True)
+for topic in binarized_cell_topic:
+    region_names_to_coordinates(
+        region_bin_topics_otsu[topic].index
+    ).sort_values(
+        ['Chromosome', 'Start', 'End']
+    ).to_csv(
+        os.path.join(folder, f'{topic}.bed'),
+        sep='\t',
+        header=False, index=False
+    )
+folder = os.path.join(out_dir, 'region_sets', 'Topics_top_3k')
+os.makedirs(folder, exist_ok=True)
+for topic in region_bin_topics_top_3k:
+    region_names_to_coordinates(
+        region_bin_topics_top_3k[topic].index
+    ).sort_values(
+        ['Chromosome', 'Start', 'End']
+    ).to_csv(
+        os.path.join(folder, f'{topic}.bed'),
+        sep='\t',
+        header=False, index=False
+    )
+folder = os.path.join(out_dir, 'candidate_enhancers')
+os.makedirs(folder, exist_ok=True)
+with open(os.path.join(folder, 'region_bin_topics_otsu.pkl'), 'wb') as f:
+    pickle.dump(region_bin_topics_otsu, f)
+with open(os.path.join(folder, 'region_bin_topics_top3k.pkl'), 'wb') as f:
+    pickle.dump(region_bin_topics_top_3k, f)
+
+# Save DARs
+folder = os.path.join(out_dir, 'region_sets', 'DARs_cell_type')
+os.makedirs(folder, exist_ok=True)
+for cell_type in markers_dict:
+    region_names_to_coordinates(
+        markers_dict[cell_type].index
+    ).sort_values(
+        ['Chromosome', 'Start', 'End']
+    ).to_csv(
+        os.path.join(folder, f'{cell_type}.bed'),
+        sep='\t',
+        header=False,
+        index=False
+    )
+folder = os.path.join(out_dir, 'candidate_enhancers')
+with open(os.path.join(folder, 'markers_dict.pkl'), 'wb') as f:
+    pickle.dump(markers_dict, f)
+
+# Get gene activity
+pr_annotation = pd.read_table(
+    os.path.join(out_dir, 'qc', 'tss.bed')
+).rename(
+    {'Name': 'Gene', '# Chromosome': 'Chromosome'}, axis=1)
+pr_annotation['Transcription_Start_Site'] = pr_annotation['Start']
+pr_annotation = pr.PyRanges(pr_annotation)
+gene_act, weights = get_gene_activity(
+    imputed_acc_obj,
+    pr_annotation,
+    chromsizes,
+    use_gene_boundaries=True,
+    upstream=[1000, 100000],
+    downstream=[1000, 100000],
+    distance_weight=True,
+    decay_rate=1,
+    extend_gene_body_upstream=10000,
+    extend_gene_body_downstream=500,
+    gene_size_weight=False,
+    gene_size_scale_factor='median',
+    remove_promoters=False,
+    average_scores=True,
+    scale_factor=1,
+    extend_tss=[10, 10],
+    gini_weight=True,
+    return_weights=True
+)
+
+# Infer differentially accessible genes
+DAG_markers_dict = find_diff_features(
+    cistopic_obj,
+    gene_act,
+    variable='cell_type',
+    var_features=None,
+    contrasts=None,
+    adjpval_thr=0.05,
+    log2fc_thr=np.log2(1.5),
+    n_cpu=5,
+    split_pattern='-'
+)
+
+# Exporting to loom
+os.makedirs(os.path.join(out_dir, 'loom'), exist_ok=True)
+cluster_markers = {'cell_type': markers_dict}
+filepath = os.path.join(out_dir, 'loom', 'region-accessibility.loom')
+print(f'Saving region accessibility to {filepath}')
+export_region_accessibility_to_loom(
+    accessibility_matrix=imputed_acc_obj,
+    cistopic_obj=cistopic_obj,
+    binarized_topic_region=region_bin_topics_otsu,
+    binarized_cell_topic=binarized_cell_topic,
+    out_fname=filepath,
+    cluster_annotation=['cell_type'],
+    cluster_markers=cluster_markers,
+    nomenclature='hg38',
+    split_pattern='-'
+)
+filepath = os.path.join(out_dir, 'loom', 'gene-activity.loom')
+print(f'Saving gene activity to {filepath}')
+export_gene_activity_to_loom(
+    gene_activity_matrix=gene_act,
+    cistopic_obj=cistopic_obj,
+    out_fname=filepath,
+    cluster_annotation=['cell_type'],
+    cluster_markers=cluster_markers,
+    nomenclature='hg38',
+    split_pattern='-'
+)
+
+work_dir = os.path.join(out_dir, 'scenicplus')
+os.makedirs(work_dir, exist_ok=True)
-    'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
-    'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad',
-    'temp_dir': 'output/scenicplus',
@@ -29,6 +581,7 @@ work_dir = par['temp_dir']
-par['cistopic_out'] = f'{work_dir}/cistopic_out'
-par['cistopic_object'] = os.path.join(par['cistopic_out'], f'cistopic_object_with_model.pkl')
 os.makedirs(os.path.join(work_dir, 'scRNA'), exist_ok=True)
 
 # Download databases
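The database download step below relies on a `download_checksum` helper whose exact signature is the repo's own; a hedged sketch of what a checksum-verified download looks like, using only the `hashlib` and `urlretrieve` imports already present in the script (URL and hash are placeholders):

```python
# Sketch of a checksum-verified download; the repo's download_checksum may differ.
import hashlib
from urllib.request import urlretrieve

def sha256sum(filepath: str) -> str:
    digest = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()

def fetch_verified(url: str, filepath: str, expected_sha256: str) -> None:
    urlretrieve(url, filepath)
    if sha256sum(filepath) != expected_sha256:
        raise RuntimeError(f'Checksum mismatch for {filepath}')
```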
@@ -90,11 +643,11 @@ def download_checksum(url: str, filepath: str) -> str:
 adata_rna.write_h5ad(os.path.join(work_dir, 'rna.h5ad'))
 
 # Load candidate enhancer regions
-with open(os.path.join(par['cistopic_out'], f'candidate_enhancers/region_bin_topics_otsu.pkl'), 'rb') as f:
+with open(os.path.join(out_dir, 'candidate_enhancers/region_bin_topics_otsu.pkl'), 'rb') as f:
     region_bin_topics_otsu = pickle.load(f)
-with open(os.path.join(par['cistopic_out'], f'candidate_enhancers/region_bin_topics_top3k.pkl'), 'rb') as f:
+with open(os.path.join(out_dir, 'candidate_enhancers/region_bin_topics_top3k.pkl'), 'rb') as f:
     region_bin_topics_top3k = pickle.load(f)
-with open(os.path.join(par['cistopic_out'], f'candidate_enhancers/markers_dict.pkl'), 'rb') as f:
+with open(os.path.join(out_dir, 'candidate_enhancers/markers_dict.pkl'), 'rb') as f:
     markers_dict = pickle.load(f)
@@ -121,21 +674,41 @@ def download_checksum(url: str, filepath: str) -> str:
 # Init scenicplus pipeline
 os.makedirs(os.path.join(work_dir, 'scplus_pipeline'), exist_ok=True)
 os.makedirs(os.path.join(work_dir, 'scplus_pipeline', 'temp'), exist_ok=True)
-subprocess.run(['scenicplus', 'init_snakemake', '--temp_dir', os.path.join(work_dir, 'scplus_pipeline')])
+subprocess.run(['scenicplus', 'init_snakemake', '--out_dir', os.path.join(work_dir, 'scplus_pipeline')])
 
 # Load pipeline settings
 with open(os.path.join(work_dir, 'scplus_pipeline', 'Snakemake', 'config', 'config.yaml'), 'r') as f:
     settings = yaml.safe_load(f)
 
-# Update settings: indicate locations of input files
+# Update settings
-settings['input_data']['cisTopic_obj_fname'] = par['cistopic_object']
+settings['input_data']['cisTopic_obj_fname'] = par['pycistopic_object']
 settings['input_data']['GEX_anndata_fname'] = os.path.join(work_dir, 'rna.h5ad')
-settings['input_data']['region_set_folder'] = os.path.join(par['cistopic_out'], 'region_sets')
+settings['input_data']['region_set_folder'] = os.path.join(out_dir, 'region_sets')
 settings['input_data']['ctx_db_fname'] = rankings_db
 settings['input_data']['dem_db_fname'] = scores_db
 settings['input_data']['path_to_motif_annotations'] = motif_annotation
 settings['params_general']['temp_dir'] = os.path.join(work_dir, 'scplus_pipeline', 'temp')
-settings['params_general']['n_cpu'] = 1
+settings['params_general']['n_cpu'] = par['num_workers']
+settings['params_inference']['quantile_thresholds_region_to_gene'] = '0.85 0.90 0.95'
+settings['params_inference']['top_n_regionTogenes_per_gene'] = '5 10 15'
+settings['params_inference']['region_to_gene_importance_method'] = 'GBM'
+settings['params_inference']['tf_to_gene_importance_method'] = 'GBM'
+settings['params_inference']['rho_threshold'] = 0.03
+settings['params_inference']['region_to_gene_correlation_method'] = 'SR'
+settings['params_inference']['min_target_genes'] = 10
+settings['params_motif_enrichment']['species'] = 'homo_sapiens'
+settings['params_motif_enrichment']['motif_similarity_fdr'] = 0.00001
+settings['params_motif_enrichment']['dem_adj_pval_thr'] = 0.05
+settings['params_motif_enrichment']['dem_log2fc_thr'] = 0.5
+settings['params_motif_enrichment']['dem_promoter_space'] = 1000
+settings['params_motif_enrichment']['dem_motif_hit_thr'] = 3.0
+settings['params_motif_enrichment']['dem_max_bg_regions'] = 3000
+settings['params_motif_enrichment']['ctx_auc_threshold'] = 0.005
+settings['params_motif_enrichment']['ctx_nes_threshold'] = 3.0
+settings['params_data_preparation']['is_multiome'] = True
+settings['params_data_preparation']['key_to_group_by'] = ''
+settings['params_data_preparation']['nr_cells_per_metacells'] = 5
+settings['params_data_preparation']['species'] = 'hsapiens' # Save pipeline settings with open(os.path.join(work_dir, 'scplus_pipeline', 'Snakemake', 'config', 'config.yaml'), 'w') as f: @@ -147,7 +720,7 @@ def download_checksum(url: str, filepath: str) -> str: with contextlib.chdir(os.path.join(work_dir, 'scplus_pipeline', 'Snakemake')): subprocess.run([ 'snakemake', - '--cores', '1', + '--cores', str(par['num_workers']), #'--unlock' ]) diff --git a/src/methods/multi_omics/scenicplus/test.sh b/src/methods/multi_omics/scenicplus/test.sh index 5ce845534..f57bce81d 100644 --- a/src/methods/multi_omics/scenicplus/test.sh +++ b/src/methods/multi_omics/scenicplus/test.sh @@ -1 +1,2 @@ -viash build src/methods/scenicplus/config.vsh.yaml --platform docker -o bin/scenicplus && bin/scenicplus/scenicplus --multiomics_atac resources/resources_test/grn-benchmark/multiomics_atac.h5ad --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction bin/scenicplus/prediction.csv --cistopic_out output/pycistopic --out_dir output/scenicplus +#viash build src/methods/multi_omics/scenicplus/config.novsh.yaml --platform docker -o bin/scenicplus && bin/scenicplus/scenicplus --multiomics_atac resources/resources_test/grn-benchmark/multiomics_atac.h5ad --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --temp_dir output/scenicplus --prediction output/scenicplus/prediction.csv --pycistopic_object output/scenicplus/pycistopic.pkl +docker run -v /mnt/c/Users/antoi/git/task_grn_inference/:/base apassemi/scenicplus python /base/src/methods/multi_omics/scenicplus/script.py diff --git a/src/methods/single_omics/ennet/config.novsh.yaml b/src/methods/single_omics/ennet/config.novsh.yaml index 8d1fc951e..b274f064d 100644 --- a/src/methods/single_omics/ennet/config.novsh.yaml +++ b/src/methods/single_omics/ennet/config.novsh.yaml @@ -20,8 +20,17 @@ functionality: required: false required: true must_exist: true + - name: --tfs + type: file + example: resources/prior/tf_all.csv + info: + label: tfs + summary: "List of putative TFs" + file_type: csv + required: true + must_exist: true - name: --prediction - __merge__: ../../api/file_prediction.yaml + __merge__: ../../../api/file_prediction.yaml required: true direction: output - name: --max_n_links diff --git a/src/methods/single_omics/ennet/script.R b/src/methods/single_omics/ennet/script.R index 17f0df863..787e42c7a 100644 --- a/src/methods/single_omics/ennet/script.R +++ b/src/methods/single_omics/ennet/script.R @@ -5,6 +5,7 @@ library(dplyr) ## VIASH START par <- list( "multiomics_rna" = 'resources/resources_test/grn-benchmark/multiomics_rna.h5ad', + "tfs" = 'resources/prior/tf_all.csv', "prediction" = 'output/ennet/prediction.csv', "temp_dir": 'output/ennet', "max_n_links": 50000 @@ -13,12 +14,27 @@ par <- list( # input expression data ad <- anndata::read_h5ad(par$multiomics_rna) -X <- ad$X +X <- as.matrix(ad$X) +gene_names <- colnames(ad) + +# Remove genes with > 90% of zeros +zero_proportion <- colMeans(X == 0) +mask <- (zero_proportion <= 0.9) +X <- X[, mask] +gene_names <- gene_names[mask] + +# Remove samples with > 90% of zeros +zero_proportion <- rowMeans(X == 0) +mask <- (zero_proportion <= 0.9) +X <- X[mask,] + +# Load list of putative TFs +dat <- read.csv(par$tfs, header = FALSE) +Tf <- which(gene_names %in% dat$V1) # Run GRN inference method K <- matrix(0,nrow(X),ncol(X)) -Tf <- 1:ncol(X) -grn = ennet(E = X, K = K, Tf = Tf) +grn <- ennet(E = X, K = K, Tf = Tf) # Re-format output df <- 
as.data.frame(as.table(grn))
diff --git a/src/methods/single_omics/ennet/test.sh b/src/methods/single_omics/ennet/test.sh
index 15cd6c794..dca2aff9c 100644
--- a/src/methods/single_omics/ennet/test.sh
+++ b/src/methods/single_omics/ennet/test.sh
@@ -1,2 +1,2 @@
-viash build src/methods/ennet/config.vsh.yaml -p docker -o bin/ennet && bin/ennet/ennet --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/ennet/prediction.csv
-#viash run src/methods/ennet/config.vsh.yaml -p docker -- ---setup build && bin/ennet/ennet --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/ennet/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/ennet/config.novsh.yaml -p docker -o bin/ennet && bin/ennet/ennet --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/ennet/prediction.csv
+#viash run src/methods/single_omics/ennet/config.novsh.yaml -p docker -- ---setup build && bin/ennet/ennet --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/ennet/prediction.csv
\ No newline at end of file
diff --git a/src/methods/single_omics/genie3/config.novsh.yaml b/src/methods/single_omics/genie3/config.novsh.yaml
index 341844c9f..97a02557f 100644
--- a/src/methods/single_omics/genie3/config.novsh.yaml
+++ b/src/methods/single_omics/genie3/config.novsh.yaml
@@ -20,8 +20,17 @@ functionality:
         required: false
     required: true
     must_exist: true
+  - name: --tfs
+    type: file
+    example: resources/prior/tf_all.csv
+    info:
+      label: tfs
+      summary: "List of putative TFs"
+      file_type: csv
+    required: true
+    must_exist: true
   - name: --prediction
-    __merge__: ../../api/file_prediction.yaml
+    __merge__: ../../../api/file_prediction.yaml
     required: true
     direction: output
   - name: --max_n_links
diff --git a/src/methods/single_omics/genie3/script.py b/src/methods/single_omics/genie3/script.py
index 7fcfdbe2d..50b527bf1 100644
--- a/src/methods/single_omics/genie3/script.py
+++ b/src/methods/single_omics/genie3/script.py
@@ -10,6 +10,7 @@
 ## VIASH START
 par = {
     'multiomics_rna': 'resources/resources_test/grn-benchmark/multiomics_rna.h5ad',
+    'tfs': 'resources/prior/tf_all.csv',
     'prediction': 'output/genie3/prediction.csv',
     'max_n_links': 50000
 }
@@ -19,11 +20,25 @@
 # Load scRNA-seq data
 adata_rna = anndata.read_h5ad(par['multiomics_rna'])
 gene_names = adata_rna.var.gene_ids.index.to_numpy()
-X = adata_rna.X
+X = adata_rna.X.toarray()
+
+# Remove genes with >=90% of zeros
+mask = (np.mean(X == 0, axis=0) >= 0.9)
+X = X[:, ~mask]
+gene_names = gene_names[~mask]
+
+# Remove samples with >=90% of zeros
+mask = (np.mean(X == 0, axis=1) >= 0.9)
+X = X[~mask, :]
+
+# Load list of putative TFs
+df = pd.read_csv(par['tfs'], header=None, names=['gene_name'])
+tfs = set(list(df['gene_name']))
+tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]
 
 # GRN inference
 client = Client(processes=False)
-network = genie3(X, client_or_address=client, gene_names=gene_names)
+network = genie3(X, client_or_address=client, gene_names=gene_names, tf_names=tf_names)
 
 # Keep only top links
 network = network.head(par['max_n_links'])
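The same dense-matrix filtering block recurs in the genie3, grnboost2, and portia scripts in this patch. Factored out as a single helper, it would look like the hedged sketch below (function and variable names are illustrative; the benchmark scripts inline this logic instead):

```python
# The >=90%-zeros gene/sample filtering repeated across these scripts, as one helper.
import numpy as np

def filter_mostly_zero(X: np.ndarray, gene_names: np.ndarray, threshold: float = 0.9):
    gene_mask = np.mean(X == 0, axis=0) >= threshold    # genes to drop
    X = X[:, ~gene_mask]
    gene_names = gene_names[~gene_mask]
    sample_mask = np.mean(X == 0, axis=1) >= threshold  # samples to drop
    return X[~sample_mask, :], gene_names
```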
diff --git a/src/methods/single_omics/genie3/test.sh b/src/methods/single_omics/genie3/test.sh
index 28c193c1b..d5a325f9f 100644
--- a/src/methods/single_omics/genie3/test.sh
+++ b/src/methods/single_omics/genie3/test.sh
@@ -1,2 +1 @@
-viash build src/methods/genie3/config.vsh.yaml -p docker -o bin/genie3 && bin/genie3/genie3 --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/genie3/prediction.csv
-#viash run src/methods/genie3/config.vsh.yaml -p docker -- ---setup build && bin/genie3/genie3 --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/genie3/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/genie3/config.novsh.yaml -p docker -o bin/genie3 && bin/genie3/genie3 --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/genie3/prediction.csv
\ No newline at end of file
diff --git a/src/methods/single_omics/grnboost2/config.novsh.yaml b/src/methods/single_omics/grnboost2/config.novsh.yaml
index ee6e0239f..48fdff137 100644
--- a/src/methods/single_omics/grnboost2/config.novsh.yaml
+++ b/src/methods/single_omics/grnboost2/config.novsh.yaml
@@ -20,8 +20,17 @@ functionality:
         required: false
     required: true
     must_exist: true
+  - name: --tfs
+    type: file
+    example: resources/prior/tf_all.csv
+    info:
+      label: tfs
+      summary: "List of putative TFs"
+      file_type: csv
+    required: true
+    must_exist: true
   - name: --prediction
-    __merge__: ../../api/file_prediction.yaml
+    __merge__: ../../../api/file_prediction.yaml
     required: true
     direction: output
   - name: --max_n_links
diff --git a/src/methods/single_omics/grnboost2/script.py b/src/methods/single_omics/grnboost2/script.py
index d278e6ad6..4ad59f8f2 100644
--- a/src/methods/single_omics/grnboost2/script.py
+++ b/src/methods/single_omics/grnboost2/script.py
@@ -10,6 +10,7 @@
 ## VIASH START
 par = {
     'multiomics_rna': 'resources/resources_test/grn-benchmark/multiomics_rna.h5ad',
+    'tfs': 'resources/prior/tf_all.csv',
     'prediction': 'output/grnboost2/prediction.csv',
     'max_n_links': 50000
 }
@@ -19,11 +20,25 @@
 # Load scRNA-seq data
 adata_rna = anndata.read_h5ad(par['multiomics_rna'])
 gene_names = adata_rna.var.gene_ids.index.to_numpy()
-X = adata_rna.X
+X = adata_rna.X.toarray()
+
+# Remove genes with >=90% of zeros
+mask = (np.mean(X == 0, axis=0) >= 0.9)
+X = X[:, ~mask]
+gene_names = gene_names[~mask]
+
+# Remove samples with >=90% of zeros
+mask = (np.mean(X == 0, axis=1) >= 0.9)
+X = X[~mask, :]
+
+# Load list of putative TFs
+df = pd.read_csv(par['tfs'], header=None, names=['gene_name'])
+tfs = set(list(df['gene_name']))
+tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]
 
 # GRN inference
 client = Client(processes=False)
-network = grnboost2(X, client_or_address=client, gene_names=gene_names)
+network = grnboost2(X, client_or_address=client, gene_names=gene_names, tf_names=tf_names)
 
 # Keep only top links
 network = network.head(par['max_n_links'])
diff --git a/src/methods/single_omics/grnboost2/test.sh b/src/methods/single_omics/grnboost2/test.sh
index 26991bf66..69d0f2715 100644
--- a/src/methods/single_omics/grnboost2/test.sh
+++ b/src/methods/single_omics/grnboost2/test.sh
@@ -1 +1 @@
-viash build src/methods/grnboost2/config.vsh.yaml -p docker -o bin/grnboost2 && bin/grnboost2/grnboost2 --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/grnboost2/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/grnboost2/config.novsh.yaml -p docker -o bin/grnboost2 && bin/grnboost2/grnboost2 --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/grnboost2/prediction.csv
\ No newline at end of file
diff --git a/src/methods/single_omics/pidc/config.novsh.yaml b/src/methods/single_omics/pidc/config.novsh.yaml
index 80db737e3..038ec66e9 100644
--- a/src/methods/single_omics/pidc/config.novsh.yaml
+++ b/src/methods/single_omics/pidc/config.novsh.yaml
@@ -21,7 +21,7 @@ functionality:
     required: true
     must_exist: true
   - name: --prediction
-    __merge__: ../../api/file_prediction.yaml
+    __merge__: ../../../api/file_prediction.yaml
     required: true
     direction: output
   - name: --max_n_links
diff --git a/src/methods/single_omics/pidc/test.sh b/src/methods/single_omics/pidc/test.sh
index 8c47f3a0d..b0213daf6 100644
--- a/src/methods/single_omics/pidc/test.sh
+++ b/src/methods/single_omics/pidc/test.sh
@@ -1,2 +1 @@
-viash build src/methods/pidc/config.vsh.yaml -p docker -o bin/pidc && bin/pidc/pidc --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv
-#viash run src/methods/pidc/config.vsh.yaml -p docker -- ---setup build && bin/pidc/pidc --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/pidc/config.novsh.yaml -p docker -o bin/pidc && bin/pidc/pidc --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv
\ No newline at end of file
diff --git a/src/methods/single_omics/portia/config.novsh.yaml b/src/methods/single_omics/portia/config.novsh.yaml
index 387620978..27312a521 100644
--- a/src/methods/single_omics/portia/config.novsh.yaml
+++ b/src/methods/single_omics/portia/config.novsh.yaml
@@ -20,8 +20,17 @@ functionality:
         required: false
     required: true
     must_exist: true
+  - name: --tfs
+    type: file
+    example: resources/prior/tf_all.csv
+    info:
+      label: tfs
+      summary: "List of putative TFs"
+      file_type: csv
+    required: true
+    must_exist: true
   - name: --prediction
-    __merge__: ../../api/file_prediction.yaml
+    __merge__: ../../../api/file_prediction.yaml
     required: true
     direction: output
   - name: --max_n_links
diff --git a/src/methods/single_omics/portia/script.py b/src/methods/single_omics/portia/script.py
index b3600523e..49c219b06 100644
--- a/src/methods/single_omics/portia/script.py
+++ b/src/methods/single_omics/portia/script.py
@@ -2,6 +2,7 @@
 import anndata
 import numpy as np
+import pandas as pd
 import scipy.sparse
 
 import portia as pt
@@ -9,6 +10,7 @@
 ## VIASH START
 par = {
     'multiomics_rna': 'resources/resources_test/grn-benchmark/multiomics_rna.h5ad',
+    'tfs': 'resources/prior/tf_all.csv',
     'prediction': 'output/portia/prediction.csv',
     'max_n_links': 50000
 }
@@ -20,12 +22,28 @@
 gene_names = adata_rna.var.gene_ids.index.to_numpy()
 X = adata_rna.X.toarray() if scipy.sparse.issparse(adata_rna.X) else adata_rna.X
 
+# Remove genes with >=90% of zeros
+mask = (np.mean(X == 0, axis=0) >= 0.9)
+X = X[:, ~mask]
+gene_names = gene_names[~mask]
+
+# Remove samples with >=90% of zeros
+mask = (np.mean(X == 0, axis=1) >= 0.9)
+X = X[~mask, :]
+
+# Load list of putative TFs
+df = pd.read_csv(par['tfs'], header=None, names=['gene_name'])
+tfs = set(list(df['gene_name']))
+tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]
+
 # GRN inference
 dataset = pt.GeneExpressionDataset()
 for exp_id, data in enumerate(X):
     dataset.add(pt.Experiment(exp_id, data))
 M_bar = pt.run(dataset, method='no-transform')
 
+print(M_bar.shape)
+
 # Save inferred GRN
 with open(par['prediction'], 'w') as f:
     f.write(f',source,target,weight\n')
diff --git a/src/methods/single_omics/portia/test.sh
diff --git a/src/methods/single_omics/portia/test.sh b/src/methods/single_omics/portia/test.sh
index daa47dc1b..b8fd188fa 100644
--- a/src/methods/single_omics/portia/test.sh
+++ b/src/methods/single_omics/portia/test.sh
@@ -1,2 +1 @@
-viash build src/methods/portia/config.vsh.yaml -p docker -o bin/portia && bin/portia/portia --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/portia/prediction.csv
-#viash run src/methods/portia/config.vsh.yaml -p docker -- ---setup build && bin/portia/portia --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/portia/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/portia/config.novsh.yaml -p docker -o bin/portia && bin/portia/portia --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/portia/prediction.csv
\ No newline at end of file
diff --git a/src/methods/single_omics/ppcor/config.novsh.yaml b/src/methods/single_omics/ppcor/config.novsh.yaml
index 6f818b85c..e74acb00f 100644
--- a/src/methods/single_omics/ppcor/config.novsh.yaml
+++ b/src/methods/single_omics/ppcor/config.novsh.yaml
@@ -21,7 +21,7 @@ functionality:
         required: true
         must_exist: true
       - name: --prediction
-        __merge__: ../../api/file_prediction.yaml
+        __merge__: ../../../api/file_prediction.yaml
         required: true
         direction: output
       - name: --max_n_links
diff --git a/src/methods/single_omics/ppcor/test.sh b/src/methods/single_omics/ppcor/test.sh
index c13ad3c7b..12af88f79 100644
--- a/src/methods/single_omics/ppcor/test.sh
+++ b/src/methods/single_omics/ppcor/test.sh
@@ -1,2 +1 @@
-viash build src/methods/ppcor/config.vsh.yaml -p docker -o bin/ppcor && bin/ppcor/ppcor --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/ppcor/prediction.csv
-#viash run src/methods/ppcor/config.vsh.yaml -p docker -- ---setup build && bin/ppcor/ppcor --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/ppcor/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/ppcor/config.novsh.yaml -p docker -o bin/ppcor && bin/ppcor/ppcor --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --prediction output/ppcor/prediction.csv
\ No newline at end of file
diff --git a/src/methods/single_omics/scsgl/config.novsh.yaml b/src/methods/single_omics/scsgl/config.novsh.yaml
index 16b92e294..dcc592973 100644
--- a/src/methods/single_omics/scsgl/config.novsh.yaml
+++ b/src/methods/single_omics/scsgl/config.novsh.yaml
@@ -21,7 +21,7 @@ functionality:
         required: true
         must_exist: true
       - name: --prediction
-        __merge__: ../../api/file_prediction.yaml
+        __merge__: ../../../api/file_prediction.yaml
         required: true
         direction: output
       - name: --max_n_links
diff --git a/src/methods/single_omics/scsgl/test.sh b/src/methods/single_omics/scsgl/test.sh
index ab4cf3cd3..d8949131c 100644
--- a/src/methods/single_omics/scsgl/test.sh
+++ b/src/methods/single_omics/scsgl/test.sh
@@ -1,2 +1 @@
-viash build src/methods/scsgl/config.vsh.yaml -p docker -o bin/scsgl && bin/scsgl/scsgl --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/scsgl/prediction.csv
-#viash run src/methods/scsgl/config.vsh.yaml -p docker -- ---setup build && bin/scsgl/scsgl --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/scsgl/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/scsgl/config.novsh.yaml -p docker -o bin/scsgl && bin/scsgl/scsgl --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --prediction output/scsgl/prediction.csv
\ No newline at end of file
diff --git a/src/methods/single_omics/tigress/config.novsh.yaml b/src/methods/single_omics/tigress/config.novsh.yaml
index 12ccc8154..5a3721256 100644
--- a/src/methods/single_omics/tigress/config.novsh.yaml
+++ b/src/methods/single_omics/tigress/config.novsh.yaml
@@ -20,8 +20,17 @@ functionality:
         required: false
         required: true
         must_exist: true
+      - name: --tfs
+        type: file
+        example: resources/prior/tf_all.csv
+        info:
+          label: tfs
+          summary: "List of putative TFs"
+          file_type: csv
+        required: true
+        must_exist: true
       - name: --prediction
-        __merge__: ../../api/file_prediction.yaml
+        __merge__: ../../../api/file_prediction.yaml
         required: true
         direction: output
       - name: --max_n_links
diff --git a/src/methods/single_omics/tigress/script.R b/src/methods/single_omics/tigress/script.R
index 220807dcf..114703150 100644
--- a/src/methods/single_omics/tigress/script.R
+++ b/src/methods/single_omics/tigress/script.R
@@ -5,6 +5,7 @@ library(dplyr)
 ## VIASH START
 par <- list(
     "multiomics_rna" = 'resources/resources_test/grn-benchmark/multiomics_rna.h5ad',
+    "tfs" = 'resources/prior/tf_all.csv',
     "prediction" = 'output/tigress/prediction.csv',
     "temp_dir" = 'output/tigress',
     "max_n_links" = 50000
@@ -13,10 +14,27 @@ par <- list(
 
 # input expression data
 ad <- anndata::read_h5ad(par$multiomics_rna)
-inputExpr <- ad$X
+X <- as.matrix(ad$X)
+gene_names <- colnames(ad)
+
+# Remove genes with > 90% of zeros
+zero_proportion <- colMeans(X == 0)
+mask <- (zero_proportion <= 0.9)
+X <- X[, mask]
+gene_names <- gene_names[mask]
+colnames(X) <- gene_names
+
+# Remove samples with > 90% of zeros
+zero_proportion <- rowMeans(X == 0)
+mask <- (zero_proportion <= 0.9)
+X <- X[mask,]
+
+# Load list of putative TFs
+dat <- read.csv(par$tfs, header = FALSE)
+Tf <- intersect(gene_names, dat$V1)
 
 # Run GRN inference method
-grn = tigress(inputExpr, allsteps=FALSE, verb=FALSE, usemulticore=TRUE)
+grn = tigress(X, tflist = Tf, targetlist = gene_names, allsteps=FALSE, verb=FALSE, usemulticore=TRUE)
 
 # Re-format output
 df <- as.data.frame(as.table(grn))
diff --git a/src/methods/single_omics/tigress/test.sh b/src/methods/single_omics/tigress/test.sh
index d2fcdbcfe..d600fb765 100644
--- a/src/methods/single_omics/tigress/test.sh
+++ b/src/methods/single_omics/tigress/test.sh
@@ -1,2 +1 @@
-viash build src/methods/tigress/config.vsh.yaml -p docker -o bin/tigress && bin/tigress/tigress --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/tigress/prediction.csv
-#viash run src/methods/tigress/config.vsh.yaml -p docker -- ---setup build && bin/tigress/tigress --multiomics_rna resources/resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/tigress/prediction.csv
\ No newline at end of file
+viash build src/methods/single_omics/tigress/config.novsh.yaml -p docker -o bin/tigress && bin/tigress/tigress --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/tigress/prediction.csv
\ No newline at end of file
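
After inference, each of these method scripts flattens its square weight matrix into the `,source,target,weight` CSV that the test commands above pass around. A sketch of that conversion, assuming dense weights and the `--max_n_links` cap from the configs; ranking links by absolute weight is an assumption here, and the individual scripts may order edges differently.

```python
import numpy as np
import pandas as pd

gene_names = ['G1', 'G2', 'G3']
weights = np.array([
    [0.0, 0.8, 0.1],
    [0.2, 0.0, 0.9],
    [0.0, 0.0, 0.0],
])
max_n_links = 50000

# Collect nonzero entries as (source, target, weight) edges
sources, targets = np.nonzero(weights)
df = pd.DataFrame({
    'source': [gene_names[i] for i in sources],
    'target': [gene_names[j] for j in targets],
    'weight': weights[sources, targets],
})

# Keep the strongest links first and cap the number of edges
order = df['weight'].abs().sort_values(ascending=False).index
df = df.loc[order].reset_index(drop=True).head(max_n_links)
df.to_csv('prediction.csv')  # header comes out as ',source,target,weight'
```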
diff --git a/src/metrics/regression_2/config.vsh.yaml b/src/metrics/regression_2/config.vsh.yaml
index e1252734b..4dba6b079 100644
--- a/src/metrics/regression_2/config.vsh.yaml
+++ b/src/metrics/regression_2/config.vsh.yaml
@@ -11,12 +11,20 @@ functionality:
         path: script.py
       - path: main.py
   arguments:
+    - name: --tfs
+      type: file
+      direction: input
+      must_exist: true
+      default: 'resources/prior/tf_all.csv'
    - name: --consensus
      type: file
      direction: input
      must_exist: true
      default: 'resources/prior/consensus-num-regulators.json'
      example: 'resources_test/prior/consensus-num-regulators.json'
+    - name: --static_only
+      type: boolean
+      default: true
 platforms:
   - type: docker
     image: ghcr.io/openproblems-bio/base_python:1.0.4
diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
index cc3dc3d58..24cc81c28 100644
--- a/src/metrics/regression_2/main.py
+++ b/src/metrics/regression_2/main.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Tuple, Any, Union
+from typing import Dict, List, Set, Tuple, Any, Union
 
 import random
 import json
@@ -59,6 +59,9 @@ def cross_validate_gene(
     scores = np.abs(grn[:, j])
     scores[j] = -1
     selected_features = np.argsort(scores)[-n_features:]
+    selected_features = selected_features[scores[selected_features] > 0]
+    if len(selected_features) == 0:
+        return results
     assert j not in selected_features
 
     X_ = X[:, selected_features]
@@ -96,44 +99,45 @@ def cross_validate_gene(
     return results
 
 
-# def learn_background_distribution(
-#     estimator_t: str,
-#     X: np.ndarray,
-#     groups: np.ndarray,
-#     max_n_regulators: int,
-
-# ) -> Dict[int, Tuple[float, float]]:
-
-#     rng = np.random.default_rng(seed=SEED)
-
-#     n_genes = X.shape[1]
-#     random_grn = rng.random(size=(n_genes, n_genes))
-
-#     background = {}
-#     for n_features in tqdm.tqdm(range(1, max_n_regulators + 1), desc='Estimating background dist'):
-#         scores = []
-#         for _ in range(N_POINTS_TO_ESTIMATE_BACKGROUND):
-#             j = rng.integers(low=0, high=n_genes)
-#             random_grn[:, j] = rng.random(size=n_genes)
-#             res = cross_validate_gene(
-#                 estimator_t,
-#                 X,
-#                 groups,
-#                 random_grn,
-#                 j,
-#                 n_features=n_features,
-#                 random_state=SEED,
-#                 n_jobs=n_jobs
-#             )
-#             scores.append(res['avg-r2'])
-#         background[n_features] = (np.mean(scores), max(0.001, np.std(scores)))
-#     background['max'] = background[max_n_regulators]
-#     return background
+def learn_background_distribution(
+    estimator_t: str,
+    X: np.ndarray,
+    groups: np.ndarray,
+    max_n_regulators: int,
+
+) -> Dict[int, Tuple[float, float]]:
+
+    rng = np.random.default_rng(seed=SEED)
+
+    n_genes = X.shape[1]
+    random_grn = rng.random(size=(n_genes, n_genes))
+
+    background = {}
+    for n_features in tqdm.tqdm(range(1, max_n_regulators + 1), desc='Estimating background dist'):
+        scores = []
+        for _ in range(N_POINTS_TO_ESTIMATE_BACKGROUND):
+            j = rng.integers(low=0, high=n_genes)
+            random_grn[:, j] = rng.random(size=n_genes)
+            res = cross_validate_gene(
+                estimator_t,
+                X,
+                groups,
+                random_grn,
+                j,
+                n_features=n_features,
+                random_state=SEED,
+                n_jobs=n_jobs
+            )
+            scores.append(res['avg-r2'])
+        background[n_features] = (np.mean(scores), max(0.001, np.std(scores)))
+    background['max'] = background[max_n_regulators]
+    return background
 
 
 def cross_validate(
     estimator_t: str,
     gene_names: np.ndarray,
+    tf_names: Set[str],
     X: np.ndarray,
     groups: np.ndarray,
     grn: np.ndarray,
@@ -141,9 +145,15 @@ def cross_validate(
     n_jobs: int
 ) -> List[Dict[str, float]]:
     n_genes = len(grn)
-
+    grn = fill_zeros_in_grn(grn)
+
+    # Remove interactions when first gene in pair is not in TF list
+    for i, gene_name in tqdm.tqdm(enumerate(gene_names), desc=f'GRN preprocessing'):
+        if gene_name not in tf_names:
+            grn[i, :] = 0
+
     # Perform cross-validation for each gene
     results = []
     for j in tqdm.tqdm(range(n_genes), desc=f'{estimator_t} CV'):
         res = cross_validate_gene(estimator_t, X, groups, grn, j, n_features=int(n_features[j]), n_jobs=n_jobs)
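
Two GRN-preprocessing steps are introduced by the hunks above before any regression is fit: outgoing edges of genes that are not in the TF list are zeroed, and `cross_validate_gene` now discards candidate regulators whose score is exactly zero instead of letting `argsort` pad the feature set with them. A small sketch with made-up names and weights:

```python
import numpy as np

gene_names = np.array(['TF1', 'G2', 'G3'])
tf_names = {'TF1'}
grn = np.array([
    [0.0, 0.9, 0.4],
    [0.3, 0.0, 0.0],  # G2 is not a TF: its outgoing edges get zeroed
    [0.0, 0.5, 0.0],  # G3 is not a TF either
])

# Zero rows whose source gene is not a putative TF
for i, gene_name in enumerate(gene_names):
    if gene_name not in tf_names:
        grn[i, :] = 0

# Select up to n_features regulators for target j, dropping zero scores
j, n_features = 1, 2
scores = np.abs(grn[:, j])
scores[j] = -1  # a gene cannot be selected as its own regulator
selected_features = np.argsort(scores)[-n_features:]
selected_features = selected_features[scores[selected_features] > 0]
print(selected_features)  # [0]: only TF1 keeps a nonzero edge onto G2
```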
@@ -155,48 +165,65 @@ def cross_validate(
     }
 
 
-# def dynamic_approach(grn: np.ndarray, X: np.ndarray, groups: np.ndarray, gene_names: List[str], reg_type: str) -> float:
-
-#     # Determine maximum number of input features
-#     n_genes = X.shape[1]
-#     max_n_regulators = min(100, int(0.5 * n_genes))
-
-#     # Learn background distribution for each value of `n_features`:
-#     # r2 scores using random genes as features.
-#     background = learn_background_distribution(reg_type, X, groups, max_n_regulators)
-
-#     # Cross-validate each gene using the inferred GRN to define select input features
-#     res = cross_validate(
-#         reg_type,
-#         gene_names,
-#         X,
-#         groups,
-#         grn,
-#         np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
-#     )
-
-#     # Compute z-scores from r2 scores to take into account the fact
-#     # that random network can still perform well when the number of features is large
-#     scores = []
-#     for j in range(n_genes):
-#         if np.isnan(res['scores'][j]['avg-r2']):
-#             continue
-#         n_features = int(np.sum(grn[:, j] != 0))
-#         if n_features in background:
-#             mu, sigma = background[n_features]
-#         else:
-#             mu, sigma = background['max']
-#         z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
-#         z_score = max(0, z_score)
-#         scores.append(z_score)
-
-#     return np.mean(scores)
+def dynamic_approach(
+    grn: np.ndarray,
+    X: np.ndarray,
+    groups: np.ndarray,
+    gene_names: List[str],
+    tf_names: Set[str],
+    reg_type: str
+) -> float:
+
+    # Determine maximum number of input features
+    n_genes = X.shape[1]
+    max_n_regulators = min(100, int(0.5 * n_genes))
+
+    # Learn background distribution for each value of `n_features`:
+    # r2 scores using random genes as features.
+    background = learn_background_distribution(reg_type, X, groups, max_n_regulators)
+
+    # Cross-validate each gene using the inferred GRN to define select input features
+    res = cross_validate(
+        reg_type,
+        gene_names,
+        tf_names,
+        X,
+        groups,
+        grn,
+        np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
+    )
+
+    # Compute z-scores from r2 scores to take into account the fact
+    # that random network can still perform well when the number of features is large
+    scores = []
+    for j in range(n_genes):
+        if np.isnan(res['scores'][j]['avg-r2']):
+            continue
+        n_features = int(np.sum(grn[:, j] != 0))
+        if n_features in background:
+            mu, sigma = background[n_features]
+        else:
+            mu, sigma = background['max']
+        z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
+        z_score = max(0, z_score)
+        scores.append(z_score)
+
+    return np.mean(scores)
 
 
-def static_approach(grn: np.ndarray, n_features: np.ndarray, X: np.ndarray, groups: np.ndarray, gene_names: List[str], reg_type: str, n_jobs:int) -> float:
+def static_approach(
+    grn: np.ndarray,
+    n_features: np.ndarray,
+    X: np.ndarray,
+    groups: np.ndarray,
+    gene_names: List[str],
+    tf_names: Set[str],
+    reg_type: str,
+    n_jobs: int
+) -> float:
 
     # Cross-validate each gene using the inferred GRN to define select input features
-    res = cross_validate(reg_type, gene_names, X, groups, grn, n_features, n_jobs=n_jobs)
+    res = cross_validate(reg_type, gene_names, tf_names, X, groups, grn, n_features, n_jobs=n_jobs)
     mean_r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
     mean_r2_scores = mean_r2_scores[mean_r2_scores > -10]
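
The restored `dynamic_approach` standardizes each gene's cross-validated r2 against a background estimated from random GRNs with the same number of regulators, clipping at zero so that only better-than-random genes contribute to the final mean. The arithmetic in isolation, with made-up background statistics:

```python
# background maps n_features -> (mean r2, std r2) under random GRNs,
# with a 'max' entry as fallback; the numbers are illustrative only.
background = {
    1: (0.05, 0.02),
    2: (0.10, 0.03),
    'max': (0.20, 0.05),
}

def z_score_for_gene(avg_r2: float, n_features: int) -> float:
    mu, sigma = background[n_features] if n_features in background else background['max']
    return max(0.0, (avg_r2 - mu) / sigma)

print(z_score_for_gene(0.19, 2))  # ~3.0: clearly above the random baseline
print(z_score_for_gene(0.08, 2))  # 0.0: clipped, no better than random
```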
@@ -235,13 +262,11 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
 
     # Load inferred GRN
     print(f'Loading GRN', flush=True)
-    grn = pd.read_csv(par['prediction'])
+    grn = load_grn(par['prediction'], gene_names)
 
     # Load and standardize perturbation data
     layer = par['layer']
     X = perturbation_data.layers[layer]
-    print(X.shape)
-
     X = RobustScaler().fit_transform(X)
 
     # Load consensus numbers of putative regulators
@@ -251,27 +276,34 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
     n_features_theta_median = np.asarray([data[gene_name]['0.5'] for gene_name in gene_names], dtype=int)
     n_features_theta_max = np.asarray([data[gene_name]['1'] for gene_name in gene_names], dtype=int)
 
+    # Load list of putative TFs
+    df = pd.read_csv(par['tfs'], header=None, names=['gene_name'])
+    tf_names = set(list(df['gene_name'].to_numpy()))
+
     # Evaluate GRN
     print(f'Compute metrics for layer: {layer}', flush=True)
     print(f'Dynamic approach:', flush=True)
-    # score_dynamic = dynamic_approach(grn, X, groups, gene_names, par['reg_type'])
     print(f'Static approach (theta=0):', flush=True)
-    score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, par['reg_type'], n_jobs=par['max_workers'])
+    score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers'])
     print(f'Static approach (theta=0.5):', flush=True)
-    score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, par['reg_type'], n_jobs=par['max_workers'])
+    score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers'])
     print(f'Static approach (theta=1):', flush=True)
-    score_static_max = static_approach(grn, n_features_theta_max, X, groups, gene_names, par['reg_type'], n_jobs=par['max_workers'])
-    # score_overall = score_dynamic + score_static_min + score_static_median + score_static_max
+    score_static_max = static_approach(grn, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['max_workers'])
 
     # TODO: find a mathematically sound way to combine Z-scores and r2 scores
-    # print(f'Score on {layer}: {score_overall}')
 
     results = {
-        'static-theta-0.0': [score_static_min],
-        'static-theta-0.5': [score_static_median],
-        'static-theta-1.0': [score_static_max],
-        # 'dynamic': [score_dynamic],
-        # 'Overall': [score_overall],
+        'static-theta-0.0': [float(score_static_min)],
+        'static-theta-0.5': [float(score_static_median)],
+        'static-theta-1.0': [float(score_static_max)],
     }
+    print(f'Scores on {layer}: {results}')
+
+    # Add dynamic score
+    if not par['static_only']:
+        score_dynamic = dynamic_approach(grn, X, groups, gene_names, tf_names, par['reg_type'])
+        score_overall = score_dynamic + score_static_min + score_static_median + score_static_max
+        results['dynamic'] = [float(score_dynamic)]
+        results['Overall'] = [float(score_overall)]
 
     # Convert results to DataFrame
     df_results = pd.DataFrame(results)
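
`main` now delegates GRN loading to `load_grn(par['prediction'], gene_names)`, which is not shown in this patch. A hypothetical sketch of what such a loader could look like, assuming the indexed `source,target,weight` CSV format the method scripts write; the actual helper may differ.

```python
import numpy as np
import pandas as pd

def load_grn(filepath: str, gene_names: np.ndarray) -> np.ndarray:
    """Turn a (source, target, weight) edge list into a dense gene-by-gene matrix."""
    gene_index = {name: i for i, name in enumerate(gene_names)}
    A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
    df = pd.read_csv(filepath, index_col=0)
    for source, target, weight in zip(df['source'], df['target'], df['weight']):
        # Skip edges touching genes absent from the perturbation data
        if source in gene_index and target in gene_index:
            A[gene_index[source], gene_index[target]] = float(weight)
    return A
```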
f"reg2-{par['method_id']}", "metric_ids": metric_ids, "metric_values": metric_values } ) - -output.write_h5ad(par["score"], compression="gzip") +output.write_h5ad(par['score'], compression='gzip') print('Completed', flush=True) diff --git a/src/metrics/regression_2/test.sh b/src/metrics/regression_2/test.sh new file mode 100644 index 000000000..97426f567 --- /dev/null +++ b/src/metrics/regression_2/test.sh @@ -0,0 +1 @@ +viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2 && bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --tfs resources/prior/tf_all.csv --consensus resources/prior/consensus-num-regulators.json --layer scgen_pearson --reg_type ridge --prediction resources/grn-benchmark/grn_models/scenicplus.csv \ No newline at end of file