Skip to content

Commit

Permalink
before merge
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Jul 30, 2024
1 parent b13d114 commit 5bc6fc0
Show file tree
Hide file tree
Showing 29 changed files with 739 additions and 91 deletions.
4 changes: 2 additions & 2 deletions scripts/download_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ set -e
echo ">> Downloading resources"

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources/grn/" \
--output "resources" \
--input "s3://openproblems-data/resources/grn/grn-benchmark" \
--output "resources/grn-benchmark" \
--delete


45 changes: 10 additions & 35 deletions scripts/generate_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
echo ">> download raw data"
aws s3 sync s3://openproblems-data/resources/grn/datasets_raw ./resources/datasets_raw/ --delete

echo ">> Create test datasets for sc perturbation and multiome"
viash run src/process_data/test_data_counts/config.novsh.yaml

echo ">> process multiome"
viash run src/process_data/multiome/config.vsh.yaml -- --multiome_counts resources/datasets_raw/multiome_counts.h5ad \
--multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
--multiomics_atac resources/grn-benchmark/multiomics_atac.h5ad
# python src/process_data/multiome/script.py
viash run src/process_data/multiomics/multiome/config.vsh.yaml

echo ">> process perturbation data"
viash run src/process_data/perturbation/config.vsh.yaml -- --perturbation_counts resources/datasets_raw/perturbation_counts.h5ad --perturbation_data resources/grn-benchmark/perturbation_data.h5ad
viash run src/process_data/perturbation/sc_counts/config.vsh.yaml


# echo ">> Perturbation data: batch correction" TODO"

Expand All @@ -24,56 +25,30 @@ viash run src/process_data/perturbation/config.vsh.yaml -- --perturbation_counts
# echo ">> process supp: TODO"

echo ">> Extract matrix and annotations from multiome "
viash run src/process_data/multiome_matrix/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
--multiomics_atac resources/grn-benchmark/multiomics_atac.h5ad \
--rna_matrix output/scRNA/X_matrix.mtx \
--atac_matrix output/scATAC/X_matrix.mtx \
--rna_gene_annot output/scRNA/annotation_gene.csv \
--rna_cell_annot output/scRNA/annotation_cell.csv \
--atac_peak_annot output/scATAC/annotation_gene.csv \
--atac_cell_annot output/scATAC/annotation_cell.csv
# python src/process_data/multiome_matrix/script.py
viash run src/process_data/multiomics/multiome_matrix/config.vsh.yaml


echo ">> Construct rds files from multiomics count matrix and annotations"
viash run src/process_data/multiome_r/config.vsh.yaml -- --rna_matrix output/scRNA/X_matrix.mtx \
--atac_matrix output/scATAC/X_matrix.mtx \
--rna_gene_annot output/scRNA/annotation_gene.csv \
--rna_cell_annot output/scRNA/annotation_cell.csv \
--atac_peak_annot output/scATAC/annotation_gene.csv \
--atac_cell_annot output/scATAC/annotation_cell.csv \
--rna_rds resources/grn-benchmark/multiomics_r/rna.rds \
--atac_rds resources/grn-benchmark/multiomics_r/atac.rds
# Rscript src/process_data/multiome_r/script.R
viash run src/process_data/multiomics/multiome_r/config.vsh.yaml


echo ">> create test resources "
viash run src/process_data/multiome_matrix/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
--multiomics_rna_test resources_test/grn-benchmark/multiomics_rna.h5ad \
--multiomics_atac resources/grn-benchmark/multiomics_atac.h5ad \
--multiomics_atac_test resources_test/grn-benchmark/multiomics_atac.h5ad \
--perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
--perturbation_data resources_test/grn-benchmark/perturbation_data.h5ad
# python src/process_data/test_data/script.py

echo ">> Extract matrix and annotations from multiome: test data "
viash run src/process_data/multiome_matrix/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad \
viash run src/process_data/multiomics/multiome_matrix/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad \
--multiomics_atac resources_test/grn-benchmark/multiomics_atac.h5ad \
--rna_matrix output/scRNA/X_matrix.mtx \
--atac_matrix output/scATAC/X_matrix.mtx \
--rna_gene_annot output/scRNA/annotation_gene.csv \
--rna_cell_annot output/scRNA/annotation_cell.csv \
--atac_peak_annot output/scATAC/annotation_gene.csv \
--atac_cell_annot output/scATAC/annotation_cell.csv
# python src/process_data/multiome_matrix/script.py

echo ">> Construct rds files from multiomics count matrix and annotations: test data"
viash run src/process_data/multiome_r/config.vsh.yaml -- --rna_matrix output/scRNA/X_matrix.mtx \
viash run src/process_data/multiomics/multiome_r/config.vsh.yaml -- --rna_matrix output/scRNA/X_matrix.mtx \
--atac_matrix output/scATAC/X_matrix.mtx \
--rna_gene_annot output/scRNA/annotation_gene.csv \
--rna_cell_annot output/scRNA/annotation_cell.csv \
--atac_peak_annot output/scATAC/annotation_gene.csv \
--atac_cell_annot output/scATAC/annotation_cell.csv \
--rna_rds resources_test/grn-benchmark/multiomics_r/rna.rds \
--atac_rds resources_test/grn-benchmark/multiomics_r/atac.rds
# Rscript src/process_data/multiome_r/script.R
46 changes: 46 additions & 0 deletions src/exp_analysis/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@

functionality:
name: explanatory_analysis
info:
label: explanatory_analysis
summary: "Explanatory analysis of inferred GRN to provide insights about network topology and annotations"

arguments:
- name: --perturbation_data
type: file
required: true
direction: input
- name: --tf_gene_net
type: file
required: true
direction: input
- name: --peak_gene_net
type: file
required: false
direction: input
- name: --annot_peak_database
type: file
required: true
direction: input
default: resources/grn-benchmark/supp/annot_peak_database.csv
- name: --annot_gene_database
type: file
required: true
direction: input
default: resources/grn-benchmark/supp/annot_gene_database.csv

resources:
- type: python_script
path: script.py
- path: helper.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
setup:
- type: python
packages: [ ]

- type: native
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
144 changes: 144 additions & 0 deletions src/exp_analysis/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import os
import pandas as pd
import numpy as np
import anndata as ad
import matplotlib.pyplot as plt


def degree_centrality(net, source='source', target='target', normalize=False):
counts = net.groupby(source)[target].nunique().values
if normalize:
total_targets = net[target].nunique()
counts = counts/total_targets
return counts

def plot_cumulative_density(data, title='', ax=None, s=1, **kwdgs):
# Step 1: Sort the data
sorted_data = np.sort(data)

# Step 2: Compute the cumulative density values
cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)

# Step 3: Plot the data
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
else:
fig = None
ax.step(sorted_data, cdf, where='post', label=title, **kwdgs)
ax.set_xlabel('Data')
ax.set_ylabel('Cumulative Density')
ax.set_title(title)
# ax.grid(True)
return fig, ax
class Connectivity:
def __init__(self, net, **kwargs):
self.out_deg = degree_centrality(net, source='source', target='target', **kwargs)
self.in_deg = degree_centrality(net, source='target', target='source', **kwargs)
def calculate_hvgs_stats(geneset, hvgs, pert_genes):
n_random = 1000
shared_genes = np.intersect1d(geneset, pert_genes)
shared_hvgs_n = len(np.intersect1d(hvgs, shared_genes))
# shared_hvgs_ratio = shared_hvgs_n/len(shared_genes)

# to percentile
random_ratios = []
for i in range(n_random):
random_genes = np.random.choice(pert_genes, shared_hvgs_n)
random_ratios.append(len(np.intersect1d(hvgs, random_genes))/shared_hvgs_n)
top_p = ((np.asarray(random_ratios)>shared_hvgs_ratio).sum()/n_random)*100

return {'n_included_hvgs':shared_hvgs_n, 'percentile_rank':round(top_p,1)}
class Explanatory_analysis:
'''
This class provides functions for explanatory analysis of GRNs including topology analysis and biological annotaions.
'''
def __init__(self, net, peak_gene_net=None):
self.net = net
self.peak_gene_net = peak_gene_net
#TODO: check the format. [source, target, weight]

self.tfs = net.source.unique()
self.targets = net.target.unique()
if peak_gene_net is not None:
self.cres = peak_gene_net.source.unique()
# check duplicates
dup_flag = False
if 'cell_type' in net.columns:
if net.duplicated(subset=['source','target','cell_type']).any():
dup_flag = True
else:
if net.duplicated(subset=['source','target']).any():
dup_flag = True
if dup_flag:
raise ValueError('The network has duplicated source target combinations.')
self.peak_annot = None
self.hvg_stats = None

self.calculate_basic_stats()

def calculate_all(self):
'''Calculate all metrics
'''
self.calculate_basic_stats()
self.calculate_centrality_stats()
self.annotate_peaks()
self.calculate_hvgs_stats()
def calculate_basic_stats(self):
self.n_links = self.net.shape[0]
self.n_source = self.net.source.nunique()
self.n_target = self.net.target.nunique()
self.ratio_positive_negative = (self.net.weight>=0).sum()/(self.net.shape[0])
def calculate_centrality_stats(self) -> None:
'''Calculate network centrality metrics such as in and out degree
'''
self.tf_gene = Connectivity(self.net, normalize=True)
if self.peak_gene_net is None:
self.peak_gene = None
self.n_cres = None
else:
self.peak_gene = Connectivity(self.peak_gene_net, normalize=False)
self.n_cres = self.peak_gene_net.source.nunique()
def plot_cdf(self, values, ax=None, title=''):
plot_cumulative_density(values, ax=ax, title=title)
def plot_connectivity(self):
data_list = [self.tf_gene.out_deg, self.tf_gene.in_deg]
if self.peak_gene is not None:
data_list += [self.peak_gene.in_deg]
for data in data_list:
plot_cumulative_density(data)
def calculate_hvgs_stats(self, hvgs, pert_genes) -> dict[str, float]:
''' Calculates metrics related to the inclusion of HVGs in the network.
'''
self.hvg_stats = calculate_hvgs_stats(self.targets, hvgs, pert_genes)
return self.hvg_stats
def annotate_peaks(self, annotation_df) -> dict[str, float]:
'''Annotate peaks with associated regions on genome.
'''
peaks = self.format_peak(self.cres)
annotation_df = annotation_df[annotation_df.peak.isin(peaks)]
value_counts = annotation_df.annotation.value_counts()
sum_values = value_counts.sum()
value_ratio = ((value_counts/sum_values)*100).round(1)

self.peak_annot = value_ratio.to_dict()
return self.peak_annot
@staticmethod
def format_peak(peaks:list[str]) -> list[str]:
'''Convert any peak data to chr:start-end
'''
import re
formatted_peaks = []
for peak in peaks:
chr_, start, end = re.split(r'[:\-_]', peak)
peak = f"{chr_}:{start}-{end}"

formatted_peaks.append(peak)
return formatted_peaks
def annotate_genes(self, gene_annotation_df) -> dict[str, float]:
'''Annotates genes'''
gene_annot = gene_annotation_df[gene_annotation_df.Gene.isin(self.targets)].Transcript_type.value_counts().to_dict()
# gene_annot.values
self.gene_annot = {key:round((value*100/len(self.targets)), 1) for key, value in gene_annot.items()}
return self.gene_annot


1 change: 1 addition & 0 deletions src/exp_analysis/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
viash run src/exp_analysis/config.vsh.yaml -- --tf_gene_net resources/grn-benchmark/grn_models/figr.csv --peak_gene_net resources/grn-benchmark/peak_gene_models/figr.csv --annot_peak_database resources/grn-benchmark/supp/annot_peak_database.csv
48 changes: 48 additions & 0 deletions src/exp_analysis/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd
import anndata as ad
import sys

## VIASH START
par = {
"perturbation_data": "resources/grn-benchmark/pertubation_data.h5ad",
"tf_gene_net": "resources/grn-benchmark/grn_models/figr.csv",
"peak_gene_net": "resources/grn-benchmark/peak_gene_models/figr.csv",
"annot_peak_database": "resources/grn-benchmark/supp/annot_peak_database.csv",
"annot_gene_database": "resources/grn-benchmark/supp/annot_gene_database.csv",
"hvgs": "resources/grn-benchmark/supp/hvgs.txt"

}
# meta = {
# "resources_dir":'resources'
# }
sys.path.append(meta["resources_dir"])
from helper import Explanatory_analysis

## VIASH END
print('Reading input files', flush=True)

perturbation_data = pd.read_csv(par["perturbation_data"])
tf_gene_net = pd.read_csv(par["tf_gene_net"])
peak_gene_net = pd.read_csv(par["peak_gene_net"])
annot_peak_database = pd.read_csv(par["annot_peak_database"])
hvgs = pd.read_csv(par["hvgs"])

print("Create exp anal object")
peak_gene_net['source'] = peak_gene_net['peak']
info_obj = Explanatory_analysis(net=tf_gene_net, peak_gene_net=peak_gene_net)
print("Annotation of peaks")
peak_annot = info_obj.annotate_peaks(annot_peak_database)
# print("Annotation of genes")
gene_annot = info_obj.annotate_genes(annot_gene_database)
print("Topological analysis")
info_obj.calculate_centrality_stats()
peak_gene = info_obj.peak_gene.in_deg
tf_gene_in = info_obj.tf_gene.in_deg
tf_gene_out = info_obj.tf_gene.out_deg

print("HVGs analysis")
pert_genes = perturbation_data.var_names
hvgs_info = info_obj.calculate_hvgs_stats(hvgs, pert_genes)



5 changes: 3 additions & 2 deletions src/process_data/explanatory_analysis/hvgs/config.novsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ functionality:

platforms:
- type: docker
image: janursa/figr:19-08-2024
image: ghcr.io/openproblems-bio/base_images/r:1.1.0
setup:
- type: r
packages: [scry]
bioc: [scry]
packages: [zellkonverter]



Expand Down
1 change: 1 addition & 0 deletions src/process_data/explanatory_analysis/hvgs/run.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
viash run src/process_data/explanatory_analysis/hvgs/config.novsh.yaml -- --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
--multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
--hvgs resources/grn-benchmark/supp/hvgs.csv


Expand Down
1 change: 0 additions & 1 deletion src/process_data/explanatory_analysis/hvgs/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ multiomics_genes <- rownames(multiomics_rna)

# Subset adata to keep only the genes present in multiomics_rna
adata <- adata[rownames(adata) %in% multiomics_genes, ]
print(adata)

adata_sce = devianceFeatureSelection(adata, assay="X", batch=colData(adata)$plate_name)

Expand Down
Loading

0 comments on commit 5bc6fc0

Please sign in to comment.