Skip to content

Commit

Permalink
updated
Browse files Browse the repository at this point in the history
  • Loading branch information
Mossi8 committed Dec 15, 2021
1 parent ccd69bc commit 6bbed73
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 20 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# oligopy

## 1. Install oligopy components
# 1. Install oligopy components

Oligopy requires different programs: blast+ (command line blast) and other pip installable packages: primer3, pandas, numpy, joblib, biopython and pyensembl.

The installation of blast+ will require the creation of the desired databases for the comparison of each probe retrieved.

# 1.1. Install Blast Command Line Tool
## 1.1. Install Blast Command Line Tool
NCBI provides command line standalone BLAST+ programs (based on the NCBI C++ toolkit) as a single compressed package. The package is available for a variety of computer platforms (hardware/operating system combinations) at:
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/
The archives for Linux and Mac OSX are gzip-compressed tar files named using the following convention:
Expand Down Expand Up @@ -40,7 +40,7 @@ set BLASTDB=$HOME/blastdb
A better approach is to have the system automatically set these variables upon login, by modifying the .bash_profile or .cshrc file.
Once they are set, the system knows where to call BLAST programs, and the invoked program will know where to look for the database files. Note that with BLASTDB unspecified, BLAST+ programs only search the working directory, i.e. the directory where BLAST command is issued. For more details about configuring BLAST+, please see http://www.ncbi.nlm.nih.gov/books/NBK279695/.

# 1.2. Make blast database
## 1.2. Make blast database
Currently, oligopy only supports the transcriptome or genome with ENSEMBL format, so that the transcriptome and genome should be downloaded from the ensembl database in fasta format:

Mouse ncRNA and cDNA:
Expand All @@ -60,7 +60,7 @@ In the terminal, move to the blastdb folder and create both databases with the f
makeblastdb -in Mus_musculus_transcriptome_DB.fa -parse_seqids -dbtype nucl
makeblastdb -in Homo_sapiens_transcriptome_DB.fa -parse_seqids -dbtype nucl

## 2. Run oligopy: parameters.
# 2. Run oligopy: parameters.

Example run with default parameters: python oligopy.py -query codebookMouse448.xlsx -db Mus_musculus.fa -ncores 12 -db_species mouse -probe_type twist -out Probes

Expand Down
110 changes: 110 additions & 0 deletions barcode_gene_order.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import pandas as pd
import numpy as np
from math import factorial
import random
import matplotlib.pyplot as plt


def optimize_gene_barcode(genes, codebook, df_exp, cycles = 15, trials=50, plot=True):
"""
Function to optimize how genes are divided over the possible barcodes.
Given a binary code book, the function will randomly shuffle the genes and
with the df_exp expression matrix it will sum the expression level of the
genes that are co-labeled in the same cycle. For the number of trials it
will return the permutation that has the lowest max expression in a cell
type over all cycles.
Args:
genes (list): List of genes.
codebook (array): Binary array containing the barcodes. Number of rows
equals the number of genes, colums is the number of cycles.
df_exp (dataframe): Dataframe containing the genes as rows and cell
types as columns. Values could be mean expression or max expression.
cycles (int): Number of cycles to optimize for. Usually same as barcode
length.
trials (int): Number of permutations of the genes to try.
plot (bool): Plots summary statistics of all trials and best trial.
Returns:
best_gene_order (list): Order of genes that gives the lowest max
expression over all cell types and cycles for teh given barcodes.
results_dict(dict): Dictionary containing all results.
Structure: results_dict[permutaion_number]:
['gene_order', 'sum_counts', 'cycle_max', 'max']
'gene_order': Contains the tested gene order,
'sum_count': Contains a dataframe with the summed counts of the
co-labeled genes for all rounds.
'cycle_max': Max of all cycles.
'max': Max of full table.
best_permutation (int) Number of the best permutation.
Access data by: results_dict[best_permutation]
"""
n_genes = len(genes)
try:
print('With your gene list of {} genes, there are {:.2E} possible premutations, you are trying: {} of them.'.format(n_genes, factorial(n_genes), trials))
except OverflowError:
print('With your gene list of {} genes, there are >10E256 possible premutations, you are trying: {} of them.'.format(n_genes, trials))

results_dict = {}

for i in range(trials):
results_dict[i] = {}

#make dataframe to store the expression level for a round
results = pd.DataFrame(np.zeros((cycles, df_exp.shape[1])), columns = df_exp.columns)

#Shuffle the genes randomly and make it into an array
shuffle_gene = np.array(random.sample(genes, len(genes)))
results_dict[i]['gene_order'] = shuffle_gene

#cycle over the cycles that have different genes in them
for cycle in range(cycles):

#genes that are simultaneously labeled in this round
positive_genes = np.array(shuffle_gene)[codebook[:,cycle] == 1]

#Get the sum of the expression
sum_exp = df_exp.loc[positive_genes].sum()

#Add it to the results df
results.loc[cycle] = sum_exp

#Add results to results_dict
results_dict[i]['sum_counts'] = results

#Find the maxima of all cycles and take the maximum of that
maxima = results.max(axis=1).max()
#Add the cycle maxima (this is the maximum summed expression in a single cell type of all genes labeled in this round)
results_dict[i]['cycle_max'] = results.max(axis=1)

results_dict[i]['max'] = results.max(axis=1).max()

#Select permutation with lowest max expression
maxima_data = {results_dict[i]['max'] : i for i in results_dict.keys()}
best_max = min(maxima_data.keys())
best_permutation = maxima_data[best_max]
best_gene_order = results_dict[best_permutation]['gene_order']
print('Found a gene order where the max expression over all cell types and cycles is: {} '.format(best_max))

if plot == True:
fig = plt.figure(constrained_layout=True, figsize=(15,5))
gs = fig.add_gridspec(1, 5)
ax1 = fig.add_subplot(gs[:, 0])
ax2 = fig.add_subplot(gs[:, 1:])


ax1.boxplot(maxima_data.keys())
ax1.scatter(1, best_max)
ax1.set_title('Iteration results')
ax1.set_ylabel('Max expression in dataset')
ax1.set_xlabel('Blue dot is chosen permutation')

#find lowest max expression
ax2.bar(np.arange(1,16,1), results_dict[best_permutation]['cycle_max'], color='grey')
ax2.boxplot(results_dict[best_permutation]['sum_counts'])
ax2.set_title('Best permutation {}: Max count over cycles'.format(best_permutation))
ax2.set_ylabel('Max expression over cell types')
ax2.set_xlabel('Barcoding cycle')

return best_gene_order, results_dict, best_permutation
8 changes: 0 additions & 8 deletions barcode_gene_order.py.webloc

This file was deleted.

Binary file added codebookMouse448.xlsx
Binary file not shown.
56 changes: 56 additions & 0 deletions gene_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
def select_genes(df, min_exp, max_exp, bad_genes):
"""
Select genes from cytograph cluster aggregate.
Args:
df (dataframe): Dataframe containing the clusterNames and their MarkerGenes
min_exp (int): Minimal required expression.
max_exp (int): Maximal allowed expression.
bad_genes (list): List of genes that are not allowed. Use if the probe
sequence generation program has difficulty designing probes for
the gene of question.
Returns:
genes (set): Unique list of genes.
"""
genes = {}
#make max expresion per gene dic
max_exp_dict = dict(zip(ds.ra['Gene'], ds[:,:].max(axis=1)))

for i in df.index:
CN = df.loc[i, 'ClusterName']
ok=False
list_expression = np.array([])
list_genes = []

for j in range(5):
#Get gene name
g = df.loc[i, 'MarkerGenes'].split(' ')[j]
#Get expression in target cluster
clust_exp = ds[np.where(ds.ra['Gene'] == g)[0][0], np.where(ds.ca['ClusterName'] == CN)[0][0]]
if g not in bad_genes:
list_expression = np.append(list_expression, clust_exp)
list_genes.append(g)
#Get max expression of all clusters
max_exp_all = max_exp_dict[g]
#Compare minimal expression with the cluster expression
#Compare max expression with max expression for all clusters
if min_exp < clust_exp and max_exp_all < max_exp and g not in bad_genes:
ok = True
break

if ok == False:
#Selection failed, pick gene that best matches the criteria.
#Get gene that is closest to the middle of the min and max expression.
diff = np.absolute(list_expression - ((max_exp - min_exp) /2))
index_best = np.where(diff == diff.min())
#Select best gene
g = list_genes[index_best[0][0]]
print('No marker for {}, best: {} with expression: {}, index {} --> {}'.format(CN, g, ds[np.where(ds.ra['Gene'] == g)[0][0], np.where(ds.ca['ClusterName'] == CN)[0][0]], index_best[0][0], np.round(list_expression,3)))

#Add to gene set
genes[CN] = g

print('\nNumber of unique selected genes: {} {} options left'.format(len(set(genes.values())), 168-len(set(genes.values()))))
return genes
8 changes: 0 additions & 8 deletions gene_selection.py.webloc

This file was deleted.

0 comments on commit 6bbed73

Please sign in to comment.