-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
run_celligner.py
149 lines (120 loc) · 5.8 KB
/
run_celligner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import celligner
import pandas as pd
import re
from taigapy import TaigaClient
tc = TaigaClient()
portal_in_dict = {}
portal_out_dict = {}
depmap_params = {'name': 'depmap_public_23q2',
'taiga_name': 'public-23q2-19de',
'taiga_file': 'OmicsExpressionProteinCodingGenesTPMLogp1',
'dset_type': 'model',
'mnn_params': None}
tcga_params = {'name': 'tcga',
'taiga_name': 'celligner-input-9827',
'taiga_file': 'tumor_expression',
'dset_type': 'tumor',
'mnn_params': None}
met500_params = {'name': 'met500',
'taiga_name': 'met500-fc3c',
'taiga_file': 'met500_TPM',
'dset_type': 'tumor',
'mnn_params': {'k1': 20, 'k2': 50}}
pdx_nv_params = {'name': 'pdx_novartis',
'taiga_name': 'pdx-data-3d29',
'taiga_file': 'Novartis_PDX_TPM',
'dset_type': 'model',
'mnn_params': {'k1': 10, 'k2': 50}}
pdx_ped_params = {'name': 'pdx_pediatric',
'taiga_name': 'pdx-data-3d29',
'taiga_file': 'pediatric_PDX_TPM',
'dset_type': 'model',
'mnn_params': {'k1': 10, 'k2': 50}}
celligner_default_extras = [met500_params, pdx_nv_params, pdx_ped_params]
def process_data(bg_df, contrast_df, extra_dfs):
# Filter to columns with ensembl id
# Need to transform from ensembl transcript ids to ensemble gene ids
# insert below
# Load HGNC gene set, filter to functional subset
hgnc_complete_set = tc.get(name='hgnc-87ab', version=7, file='hgnc_complete_set')
func_genes = hgnc_complete_set[~hgnc_complete_set.locus_group.isin(["non-coding RNA", "pseudogene"])]
# background datafrrame should have genes labeled by gene symbol
# need to reindex
# print('bg_genes going in:', bg_df.keys())
bg_genes = pd.Series(bg_df.keys()).apply(lambda s: re.search(r'^([\w.-]+) \(', s).group(1)).rename('symbol')
bg_genes = bg_genes.set_axis(bg_df.keys()).to_frame()
bg_genes = bg_genes.reset_index().merge(hgnc_complete_set[['symbol', 'ensembl_gene_id']],
left_on='symbol', right_on='symbol')[['index','ensembl_gene_id']].set_index('index')
# print('bg_genes processed:', bg_genes)
bg_df = bg_df.rename(columns=bg_genes.to_dict()['ensembl_gene_id'])
# print(bg_df.head())
gene_sets = [set(bg_df.columns), set(contrast_df.columns)] + [set(_df.columns) for _df in extra_dfs]
gene_set = set(func_genes.ensembl_gene_id).intersection(*gene_sets)
common_genes = [x for x in bg_df.columns if x in gene_set]
print('Common genes:', len(common_genes))
bg_df = bg_df[common_genes]
contrast_df = contrast_df[common_genes]
extra_dfs = [_df[common_genes] for _df in extra_dfs]
return bg_df, contrast_df, extra_dfs
def run_celligner(bg=depmap_params, contrast=tcga_params, extra_dsets=celligner_default_extras):
# load data to be used as the background and label the source
bg_df = tc.get(name=bg['taiga_name'], file=bg['taiga_file'])
bg_df = pd.concat({bg['name']: bg_df}, names=['source'])
# load data to be contrasted with the background and label the source
contrast_df = tc.get(name=contrast['taiga_name'], file=contrast['taiga_file'])
contrast_df = pd.concat({contrast['name']: contrast_df}, names=['source'])
# if there are additional datasets to be projected then collect and label them as well
extra_dfs = []
for dset in extra_dsets:
_df = tc.get(name=dset['taiga_name'], file=dset['taiga_file'])
if dset['name'] == 'pdx_novartis':
_df = _df.T
_df = pd.concat({dset['name']: _df}, names=['source'])
extra_dfs.append(_df)
'''print(bg_df.head())
print(contrast_df.head())
for _df in extra_dfs:
print(_df.head())'''
# make sure all datasets are using ensembl gene ids and are restricted to common sets of genes
bg_df, contrast_df, extra_dfs = process_data(bg_df, contrast_df, extra_dfs)
'''print(bg_df.head())
print(contrast_df.head())
for _df in extra_dfs:
print(_df.head())'''
# Create Celligner object and fit + transform the reference (depmap) and target (TCGA) expression datasets
my_celligner = celligner.Celligner()
my_celligner.fit(bg_df.droplevel(0,0))
my_celligner.transform(contrast_df.droplevel(0,0))
# add in additional datasets to be projected if they are given
for _df in extra_dfs:
df_name = _df.index.get_level_values(0).unique()[0]
for dset in extra_dsets:
if dset['name'] == df_name:
break
my_celligner.makeNewReference()
if dset['mnn_params']:
p = dset['mnn_params']
my_celligner.mnn_kwargs.update({'k1': p['k1'], 'k2': p['k2']})
my_celligner.transform(_df.droplevel(0,0), compute_cPCs=False)
# Compute UMAP, clusters and tumor - model distance
model_ids = list(bg_df.index.get_level_values(1))
tumor_ids = list(contrast_df.index.get_level_values(1))
for _df in extra_dfs:
df_name = _df.index.get_level_values(0).unique()[0]
for dset in extra_dsets:
if dset['name'] == df_name:
break
if dset['dset_type'] == 'model':
model_ids += list(_df.index.get_level_values(1))
elif dset['dset_type'] == 'tumor':
tumor_ids += list(_df.index.get_level_values(1))
my_celligner.computeMetricsForOutput(model_ids=model_ids, tumor_ids=tumor_ids)
outname = 'celligner_output_' \
+ bg['name'] + '_' \
+ contrast['name'] + '_' \
+ '_'.join([d['name'] for d in extra_dsets]) + ".pkl"
my_celligner.save(outname)
print('Model saved to: ', outname)
return outname
if __name__ == "__main__":
run_celligner()