-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
29 changed files
with
739 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
|
||
functionality: | ||
name: explanatory_analysis | ||
info: | ||
label: explanatory_analysis | ||
summary: "Explanatory analysis of inferred GRN to provide insights about network topology and annotations" | ||
|
||
arguments: | ||
- name: --perturbation_data | ||
type: file | ||
required: true | ||
direction: input | ||
- name: --tf_gene_net | ||
type: file | ||
required: true | ||
direction: input | ||
- name: --peak_gene_net | ||
type: file | ||
required: false | ||
direction: input | ||
- name: --annot_peak_database | ||
type: file | ||
required: true | ||
direction: input | ||
default: resources/grn-benchmark/supp/annot_peak_database.csv | ||
- name: --annot_gene_database | ||
type: file | ||
required: true | ||
direction: input | ||
default: resources/grn-benchmark/supp/annot_gene_database.csv | ||
|
||
resources: | ||
- type: python_script | ||
path: script.py | ||
- path: helper.py | ||
platforms: | ||
- type: docker | ||
image: ghcr.io/openproblems-bio/base_python:1.0.4 | ||
setup: | ||
- type: python | ||
packages: [ ] | ||
|
||
- type: native | ||
- type: nextflow | ||
directives: | ||
label: [midtime,midmem,midcpu] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import os | ||
import pandas as pd | ||
import numpy as np | ||
import anndata as ad | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
def degree_centrality(net, source='source', target='target', normalize=False): | ||
counts = net.groupby(source)[target].nunique().values | ||
if normalize: | ||
total_targets = net[target].nunique() | ||
counts = counts/total_targets | ||
return counts | ||
|
||
def plot_cumulative_density(data, title='', ax=None, s=1, **kwdgs): | ||
# Step 1: Sort the data | ||
sorted_data = np.sort(data) | ||
|
||
# Step 2: Compute the cumulative density values | ||
cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data) | ||
|
||
# Step 3: Plot the data | ||
if ax is None: | ||
fig, ax = plt.subplots(1, 1, figsize=(4, 4)) | ||
else: | ||
fig = None | ||
ax.step(sorted_data, cdf, where='post', label=title, **kwdgs) | ||
ax.set_xlabel('Data') | ||
ax.set_ylabel('Cumulative Density') | ||
ax.set_title(title) | ||
# ax.grid(True) | ||
return fig, ax | ||
class Connectivity: | ||
def __init__(self, net, **kwargs): | ||
self.out_deg = degree_centrality(net, source='source', target='target', **kwargs) | ||
self.in_deg = degree_centrality(net, source='target', target='source', **kwargs) | ||
def calculate_hvgs_stats(geneset, hvgs, pert_genes): | ||
n_random = 1000 | ||
shared_genes = np.intersect1d(geneset, pert_genes) | ||
shared_hvgs_n = len(np.intersect1d(hvgs, shared_genes)) | ||
# shared_hvgs_ratio = shared_hvgs_n/len(shared_genes) | ||
|
||
# to percentile | ||
random_ratios = [] | ||
for i in range(n_random): | ||
random_genes = np.random.choice(pert_genes, shared_hvgs_n) | ||
random_ratios.append(len(np.intersect1d(hvgs, random_genes))/shared_hvgs_n) | ||
top_p = ((np.asarray(random_ratios)>shared_hvgs_ratio).sum()/n_random)*100 | ||
|
||
return {'n_included_hvgs':shared_hvgs_n, 'percentile_rank':round(top_p,1)} | ||
class Explanatory_analysis: | ||
''' | ||
This class provides functions for explanatory analysis of GRNs including topology analysis and biological annotaions. | ||
''' | ||
def __init__(self, net, peak_gene_net=None): | ||
self.net = net | ||
self.peak_gene_net = peak_gene_net | ||
#TODO: check the format. [source, target, weight] | ||
|
||
self.tfs = net.source.unique() | ||
self.targets = net.target.unique() | ||
if peak_gene_net is not None: | ||
self.cres = peak_gene_net.source.unique() | ||
# check duplicates | ||
dup_flag = False | ||
if 'cell_type' in net.columns: | ||
if net.duplicated(subset=['source','target','cell_type']).any(): | ||
dup_flag = True | ||
else: | ||
if net.duplicated(subset=['source','target']).any(): | ||
dup_flag = True | ||
if dup_flag: | ||
raise ValueError('The network has duplicated source target combinations.') | ||
self.peak_annot = None | ||
self.hvg_stats = None | ||
|
||
self.calculate_basic_stats() | ||
|
||
def calculate_all(self): | ||
'''Calculate all metrics | ||
''' | ||
self.calculate_basic_stats() | ||
self.calculate_centrality_stats() | ||
self.annotate_peaks() | ||
self.calculate_hvgs_stats() | ||
def calculate_basic_stats(self): | ||
self.n_links = self.net.shape[0] | ||
self.n_source = self.net.source.nunique() | ||
self.n_target = self.net.target.nunique() | ||
self.ratio_positive_negative = (self.net.weight>=0).sum()/(self.net.shape[0]) | ||
def calculate_centrality_stats(self) -> None: | ||
'''Calculate network centrality metrics such as in and out degree | ||
''' | ||
self.tf_gene = Connectivity(self.net, normalize=True) | ||
if self.peak_gene_net is None: | ||
self.peak_gene = None | ||
self.n_cres = None | ||
else: | ||
self.peak_gene = Connectivity(self.peak_gene_net, normalize=False) | ||
self.n_cres = self.peak_gene_net.source.nunique() | ||
def plot_cdf(self, values, ax=None, title=''): | ||
plot_cumulative_density(values, ax=ax, title=title) | ||
def plot_connectivity(self): | ||
data_list = [self.tf_gene.out_deg, self.tf_gene.in_deg] | ||
if self.peak_gene is not None: | ||
data_list += [self.peak_gene.in_deg] | ||
for data in data_list: | ||
plot_cumulative_density(data) | ||
def calculate_hvgs_stats(self, hvgs, pert_genes) -> dict[str, float]: | ||
''' Calculates metrics related to the inclusion of HVGs in the network. | ||
''' | ||
self.hvg_stats = calculate_hvgs_stats(self.targets, hvgs, pert_genes) | ||
return self.hvg_stats | ||
def annotate_peaks(self, annotation_df) -> dict[str, float]: | ||
'''Annotate peaks with associated regions on genome. | ||
''' | ||
peaks = self.format_peak(self.cres) | ||
annotation_df = annotation_df[annotation_df.peak.isin(peaks)] | ||
value_counts = annotation_df.annotation.value_counts() | ||
sum_values = value_counts.sum() | ||
value_ratio = ((value_counts/sum_values)*100).round(1) | ||
|
||
self.peak_annot = value_ratio.to_dict() | ||
return self.peak_annot | ||
@staticmethod | ||
def format_peak(peaks:list[str]) -> list[str]: | ||
'''Convert any peak data to chr:start-end | ||
''' | ||
import re | ||
formatted_peaks = [] | ||
for peak in peaks: | ||
chr_, start, end = re.split(r'[:\-_]', peak) | ||
peak = f"{chr_}:{start}-{end}" | ||
|
||
formatted_peaks.append(peak) | ||
return formatted_peaks | ||
def annotate_genes(self, gene_annotation_df) -> dict[str, float]: | ||
'''Annotates genes''' | ||
gene_annot = gene_annotation_df[gene_annotation_df.Gene.isin(self.targets)].Transcript_type.value_counts().to_dict() | ||
# gene_annot.values | ||
self.gene_annot = {key:round((value*100/len(self.targets)), 1) for key, value in gene_annot.items()} | ||
return self.gene_annot | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
viash run src/exp_analysis/config.vsh.yaml -- --tf_gene_net resources/grn-benchmark/grn_models/figr.csv --peak_gene_net resources/grn-benchmark/peak_gene_models/figr.csv --annot_peak_database resources/grn-benchmark/supp/annot_peak_database.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import pandas as pd | ||
import anndata as ad | ||
import sys | ||
|
||
## VIASH START | ||
par = { | ||
"perturbation_data": "resources/grn-benchmark/pertubation_data.h5ad", | ||
"tf_gene_net": "resources/grn-benchmark/grn_models/figr.csv", | ||
"peak_gene_net": "resources/grn-benchmark/peak_gene_models/figr.csv", | ||
"annot_peak_database": "resources/grn-benchmark/supp/annot_peak_database.csv", | ||
"annot_gene_database": "resources/grn-benchmark/supp/annot_gene_database.csv", | ||
"hvgs": "resources/grn-benchmark/supp/hvgs.txt" | ||
|
||
} | ||
# meta = { | ||
# "resources_dir":'resources' | ||
# } | ||
sys.path.append(meta["resources_dir"]) | ||
from helper import Explanatory_analysis | ||
|
||
## VIASH END | ||
print('Reading input files', flush=True) | ||
|
||
perturbation_data = pd.read_csv(par["perturbation_data"]) | ||
tf_gene_net = pd.read_csv(par["tf_gene_net"]) | ||
peak_gene_net = pd.read_csv(par["peak_gene_net"]) | ||
annot_peak_database = pd.read_csv(par["annot_peak_database"]) | ||
hvgs = pd.read_csv(par["hvgs"]) | ||
|
||
print("Create exp anal object") | ||
peak_gene_net['source'] = peak_gene_net['peak'] | ||
info_obj = Explanatory_analysis(net=tf_gene_net, peak_gene_net=peak_gene_net) | ||
print("Annotation of peaks") | ||
peak_annot = info_obj.annotate_peaks(annot_peak_database) | ||
# print("Annotation of genes") | ||
gene_annot = info_obj.annotate_genes(annot_gene_database) | ||
print("Topological analysis") | ||
info_obj.calculate_centrality_stats() | ||
peak_gene = info_obj.peak_gene.in_deg | ||
tf_gene_in = info_obj.tf_gene.in_deg | ||
tf_gene_out = info_obj.tf_gene.out_deg | ||
|
||
print("HVGs analysis") | ||
pert_genes = perturbation_data.var_names | ||
hvgs_info = info_obj.calculate_hvgs_stats(hvgs, pert_genes) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.