-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
170 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import pandas as pd | ||
import numpy as np | ||
from math import factorial | ||
import random | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
def optimize_gene_barcode(genes, codebook, df_exp, cycles=15, trials=50, plot=True):
    """
    Optimize how genes are divided over the available barcodes.

    Given a binary codebook, the genes are randomly shuffled `trials` times.
    For every shuffle, the expression (from `df_exp`) of all genes that are
    co-labeled in the same cycle is summed per cell type. The permutation
    with the lowest maximum summed expression over all cell types and cycles
    is returned.

    Args:
        genes (list): List of genes.
        codebook (array): Binary array containing the barcodes. Number of rows
            equals the number of genes, columns is the number of cycles.
        df_exp (dataframe): Dataframe containing the genes as rows and cell
            types as columns. Values could be mean expression or max
            expression.
        cycles (int): Number of cycles to optimize for. Usually same as
            barcode length.
        trials (int): Number of permutations of the genes to try. Must be at
            least 1.
        plot (bool): Plots summary statistics of all trials and best trial.

    Returns:
        best_gene_order (array): Order of genes that gives the lowest max
            expression over all cell types and cycles for the given barcodes.
        results_dict (dict): Dictionary containing all results.
            Structure: results_dict[permutation_number]:
                ['gene_order', 'sum_counts', 'cycle_max', 'max']
            'gene_order': Contains the tested gene order.
            'sum_counts': Contains a dataframe (cycles x cell types) with the
                summed counts of the co-labeled genes for all rounds.
            'cycle_max': Max over cell types for every cycle.
            'max': Max of full table.
        best_permutation (int): Number of the best permutation. When several
            permutations tie, the first one tried is reported.
            Access data by: results_dict[best_permutation]

    Raises:
        ValueError: If `trials` is smaller than 1 (there would be nothing to
            choose from).
    """
    if trials < 1:
        raise ValueError('trials must be at least 1, got {}'.format(trials))

    n_genes = len(genes)
    try:
        print('With your gene list of {} genes, there are {:.2E} possible premutations, you are trying: {} of them.'.format(n_genes, factorial(n_genes), trials))
    except OverflowError:
        # factorial(n_genes) is too large to be formatted as a float.
        print('With your gene list of {} genes, there are >10E256 possible premutations, you are trying: {} of them.'.format(n_genes, trials))

    # The codebook does not change between trials, so the boolean mask that
    # selects the genes labeled in each cycle is computed only once.
    codebook = np.asarray(codebook)
    cycle_masks = [codebook[:, cycle] == 1 for cycle in range(cycles)]

    results_dict = {}

    for i in range(trials):
        # Dataframe to store the summed expression level for every cycle
        results = pd.DataFrame(np.zeros((cycles, df_exp.shape[1])), columns=df_exp.columns)

        # Shuffle the genes randomly and make it into an array
        shuffle_gene = np.array(random.sample(genes, n_genes))

        for cycle, mask in enumerate(cycle_masks):
            # Genes that are simultaneously labeled in this round
            positive_genes = shuffle_gene[mask]
            # Sum of their expression per cell type
            results.loc[cycle] = df_exp.loc[positive_genes].sum()

        # Maximum summed expression in a single cell type of all genes
        # labeled in each round.
        cycle_max = results.max(axis=1)

        results_dict[i] = {
            'gene_order': shuffle_gene,
            'sum_counts': results,
            'cycle_max': cycle_max,
            'max': cycle_max.max(),
        }

    # Select permutation with lowest max expression. A plain list keeps one
    # entry per trial, so tied trials are not collapsed into a single value.
    trial_maxima = [results_dict[i]['max'] for i in range(trials)]
    best_max = min(trial_maxima)
    best_permutation = trial_maxima.index(best_max)
    best_gene_order = results_dict[best_permutation]['gene_order']
    print('Found a gene order where the max expression over all cell types and cycles is: {} '.format(best_max))

    if plot:
        fig = plt.figure(constrained_layout=True, figsize=(15, 5))
        gs = fig.add_gridspec(1, 5)
        ax1 = fig.add_subplot(gs[:, 0])
        ax2 = fig.add_subplot(gs[:, 1:])

        # Distribution of the maxima of all trials, with the chosen one marked
        ax1.boxplot(trial_maxima)
        ax1.scatter(1, best_max)
        ax1.set_title('Iteration results')
        ax1.set_ylabel('Max expression in dataset')
        ax1.set_xlabel('Blue dot is chosen permutation')

        best = results_dict[best_permutation]
        # One bar per cycle; positions follow `cycles` instead of a fixed 15.
        ax2.bar(np.arange(1, cycles + 1), best['cycle_max'], color='grey')
        # boxplot draws one box per column of a 2D array, so transpose the
        # (cycles x cell types) table to get one box per cycle.
        ax2.boxplot(best['sum_counts'].values.T)
        ax2.set_title('Best permutation {}: Max count over cycles'.format(best_permutation))
        ax2.set_ylabel('Max expression over cell types')
        ax2.set_xlabel('Barcoding cycle')

    return best_gene_order, results_dict, best_permutation
This file was deleted.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
def select_genes(df, min_exp, max_exp, bad_genes, n_options=168):
    """
    Select one marker gene per cluster from a cytograph cluster aggregate.

    For every row of `df`, the first (up to five) marker genes are checked in
    order. The first gene that is not in `bad_genes`, whose expression in the
    target cluster is above `min_exp` and whose maximum expression over all
    clusters is below `max_exp` is selected. If no gene passes, the allowed
    gene whose expression in the target cluster is closest to the midpoint of
    `min_exp` and `max_exp` is selected instead and a message is printed.

    NOTE(review): expression values are read from a name `ds` that is not a
    parameter and is not defined in this function; it must exist in the
    enclosing scope and provide `ds.ra['Gene']`, `ds.ca['ClusterName']` and
    2D indexing. This looks like a loom-style aggregate -- confirm.

    Args:
        df (dataframe): Dataframe containing the columns 'ClusterName' and
            'MarkerGenes' (space separated gene names).
        min_exp (int): Minimal required expression.
        max_exp (int): Maximal allowed expression.
        bad_genes (list): List of genes that are not allowed. Use if the probe
            sequence generation program has difficulty designing probes for
            the gene of question.
        n_options (int): Total number of options used for the "options left"
            summary message (presumably the number of available barcodes --
            confirm). Defaults to 168.

    Returns:
        genes (dict): Selected gene for every cluster name.

    Raises:
        ValueError: If all candidate marker genes of a cluster are in
            `bad_genes`, so nothing can be selected.
    """
    genes = {}
    gene_names = ds.ra['Gene']
    cluster_names = ds.ca['ClusterName']
    # Max expression over all clusters, per gene
    max_exp_dict = dict(zip(gene_names, ds[:, :].max(axis=1)))
    # Fallback target: the middle of the allowed expression window
    target_exp = (min_exp + max_exp) / 2.0

    for i in df.index:
        CN = df.loc[i, 'ClusterName']
        cluster_col = np.where(cluster_names == CN)[0][0]
        ok = False
        candidate_exp = []
        list_genes = []

        # Consider at most the first five marker genes; slicing avoids an
        # IndexError when fewer than five are listed.
        for g in df.loc[i, 'MarkerGenes'].split(' ')[:5]:
            if g in bad_genes:
                continue
            # Expression in target cluster
            clust_exp = ds[np.where(gene_names == g)[0][0], cluster_col]
            candidate_exp.append(clust_exp)
            list_genes.append(g)
            # Compare minimal expression with the cluster expression and
            # max expression with the max expression over all clusters.
            if min_exp < clust_exp and max_exp_dict[g] < max_exp:
                ok = True
                break

        if not ok:
            if not list_genes:
                raise ValueError('No usable marker gene for cluster {}: all candidates are in bad_genes.'.format(CN))
            # Selection failed, pick the gene that best matches the criteria:
            # the one closest to the middle of the min and max expression.
            list_expression = np.array(candidate_exp, dtype=float)
            diff = np.absolute(list_expression - target_exp)
            index_best = int(np.argmin(diff))
            g = list_genes[index_best]
            print('No marker for {}, best: {} with expression: {}, index {} --> {}'.format(CN, g, candidate_exp[index_best], index_best, np.round(list_expression, 3)))

        # Add to gene selection
        genes[CN] = g

    n_unique = len(set(genes.values()))
    print('\nNumber of unique selected genes: {} {} options left'.format(n_unique, n_options - n_unique))
    return genes
This file was deleted.
Oops, something went wrong.