updated

linnarsson-lab · Dec 15, 2021 · 6bbed73 · 6bbed73
1 parent ccd69bc
commit 6bbed73
Show file tree

Hide file tree

Showing 6 changed files with 170 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -1,12 +1,12 @@
 # oligopy
 
-## 1. Install oligopy components
+# 1. Install oligopy components
 
 Oligopy requires different programs: blast+ (command line blast) and other pip installable packages: primer3, pandas, numpy, joblib, biopython and pyensembl.
 
 The installation of blast+ will require the creation of the desired databases for the comparison of each probe retrieved.
 
-# 1.1. Install Blast Command Line Tool
+## 1.1. Install Blast Command Line Tool
 NCBI provides command line standalone BLAST+ programs (based on the NCBI C++ toolkit) as a single compressed package. The package is available for a variety of computer platforms (hardware/operating system combinations) at:
 ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/
 The archives for Linux and Mac OSX are gzip-compressed tar files named using the following convention:
@@ -40,7 +40,7 @@ set BLASTDB=$HOME/blastdb
 A better approach is to have the system automatically set these variables upon login, by modifying the .bash_profile or .cshrc file.
 Once they are set, the system knows where to call BLAST programs, and the invoked program will know where to look for the database files. Note that with BLASTDB unspecified, BLAST+ programs only search the working directory, i.e. the directory where BLAST command is issued. For more details about configuring BLAST+, please see http://www.ncbi.nlm.nih.gov/books/NBK279695/.
 
-# 1.2. Make blast database
+## 1.2. Make blast database
 Currently, oligopy only supports the transcriptome or genome with ENSEMBL format, so that the transcriptome and genome should be downloaded from the ensembl database in fasta format:
 
 Mouse ncRNA and cDNA:
@@ -60,7 +60,7 @@ In the terminal, move to the blastdb folder and create both databases with the f
 makeblastdb -in Mus_musculus_transcriptome_DB.fa -parse_seqids -dbtype nucl
 makeblastdb -in Homo_sapiens_transcriptome_DB.fa -parse_seqids -dbtype nucl
 
-## 2. Run oligopy: parameters.
+# 2. Run oligopy: parameters.
 
 Example run with default parameters: python oligopy.py -query codebookMouse448.xlsx -db Mus_musculus.fa -ncores 12 -db_species mouse -probe_type twist -out Probes
 

diff --git a/barcode_gene_order.py b/barcode_gene_order.py
@@ -0,0 +1,110 @@
+import pandas as pd
+import numpy as np
+from math import factorial
+import random
+import matplotlib.pyplot as plt
+
+
+def optimize_gene_barcode(genes, codebook, df_exp, cycles = 15, trials=50, plot=True):
+    """
+    Function to optimize how genes are divided over the possible barcodes.
+    Given a binary code book, the function will randomly shuffle the genes and
+    with the df_exp expression matrix it will sum the expression level of the
+    genes that are co-labeled in the same cycle. For the number of trials it
+    will return the permutation that has the lowest max expression in a cell
+    type over all cycles. 
+    
+    Args:
+    genes (list): List of genes.
+    codebook (array): Binary array containing the barcodes. Number of rows
+        equals the number of genes, colums is the number of cycles.
+    df_exp (dataframe): Dataframe containing the genes as rows and cell
+        types as columns. Values could be mean expression or max expression.
+    cycles (int): Number of cycles to optimize for. Usually same as barcode
+        length.
+    trials (int): Number of permutations of the genes to try.
+    plot (bool): Plots summary statistics of all trials and best trial.
+    
+    Returns:
+    best_gene_order (list): Order of genes that gives the lowest max
+        expression over all cell types and cycles for teh given barcodes.
+    results_dict(dict): Dictionary containing all results.
+        Structure: results_dict[permutaion_number]:
+        ['gene_order', 'sum_counts', 'cycle_max', 'max']
+        'gene_order': Contains the tested gene order,
+        'sum_count': Contains a dataframe with the summed counts of the 
+            co-labeled genes for all rounds.
+        'cycle_max': Max of all cycles.
+        'max': Max of full table.
+    best_permutation (int) Number of the best permutation. 
+        Access data by: results_dict[best_permutation]
+    
+    """
+    n_genes = len(genes)
+    try:
+        print('With your gene list of {} genes, there are {:.2E} possible premutations, you are trying: {} of them.'.format(n_genes, factorial(n_genes), trials))
+    except OverflowError:
+        print('With your gene list of {} genes, there are >10E256 possible premutations, you are trying: {} of them.'.format(n_genes, trials))
+
+    results_dict = {}
+
+    for i in range(trials):
+        results_dict[i] = {}
+
+        #make dataframe to store the expression level for a round
+        results = pd.DataFrame(np.zeros((cycles, df_exp.shape[1])), columns = df_exp.columns)
+
+        #Shuffle the genes randomly and make it into an array
+        shuffle_gene = np.array(random.sample(genes, len(genes)))
+        results_dict[i]['gene_order'] = shuffle_gene
+
+        #cycle over the cycles that have different genes in them
+        for cycle in range(cycles):
+
+            #genes that are simultaneously labeled in this round
+            positive_genes = np.array(shuffle_gene)[codebook[:,cycle] == 1]
+
+            #Get the sum of the expression
+            sum_exp = df_exp.loc[positive_genes].sum()
+
+            #Add it to the results df
+            results.loc[cycle] = sum_exp
+
+        #Add results to results_dict
+        results_dict[i]['sum_counts'] = results
+
+        #Find the maxima of all cycles and take the maximum of that
+        maxima = results.max(axis=1).max()
+        #Add the cycle maxima (this is the maximum summed expression in a single cell type of all genes labeled in this round)
+        results_dict[i]['cycle_max'] = results.max(axis=1)
+
+        results_dict[i]['max'] = results.max(axis=1).max()
+
+    #Select permutation with lowest max expression
+    maxima_data = {results_dict[i]['max'] : i for i in results_dict.keys()}
+    best_max = min(maxima_data.keys())
+    best_permutation = maxima_data[best_max]
+    best_gene_order = results_dict[best_permutation]['gene_order']
+    print('Found a gene order where the max expression over all cell types and cycles is: {} '.format(best_max))
+
+    if plot == True:
+        fig = plt.figure(constrained_layout=True, figsize=(15,5))
+        gs = fig.add_gridspec(1, 5)
+        ax1 = fig.add_subplot(gs[:, 0])
+        ax2 = fig.add_subplot(gs[:, 1:])
+
+
+        ax1.boxplot(maxima_data.keys())
+        ax1.scatter(1, best_max)
+        ax1.set_title('Iteration results')
+        ax1.set_ylabel('Max expression in dataset')
+        ax1.set_xlabel('Blue dot is chosen permutation')
+
+        #find lowest max expression
+        ax2.bar(np.arange(1,16,1), results_dict[best_permutation]['cycle_max'], color='grey')
+        ax2.boxplot(results_dict[best_permutation]['sum_counts'])
+        ax2.set_title('Best permutation {}: Max count over cycles'.format(best_permutation))
+        ax2.set_ylabel('Max expression over cell types')
+        ax2.set_xlabel('Barcoding cycle')
+
+    return best_gene_order, results_dict, best_permutation
diff --git a/barcode_gene_order.py.webloc b/barcode_gene_order.py.webloc
diff --git a/codebookMouse448.xlsx b/codebookMouse448.xlsx
diff --git a/gene_selection.py b/gene_selection.py
@@ -0,0 +1,56 @@
+def select_genes(df, min_exp, max_exp, bad_genes):
+    """
+    Select genes from cytograph cluster aggregate.
+    
+    Args:
+    df (dataframe): Dataframe containing the clusterNames and their MarkerGenes
+    min_exp (int): Minimal required expression.
+    max_exp (int): Maximal allowed expression.
+    bad_genes (list): List of genes that are not allowed. Use if the probe
+        sequence generation program has difficulty designing probes for 
+        the gene of question. 
+        
+    Returns:
+    genes (set): Unique list of genes.
+    
+    """
+    genes = {}
+    #make max expresion per gene dic
+    max_exp_dict = dict(zip(ds.ra['Gene'], ds[:,:].max(axis=1)))
+
+    for i in df.index:
+        CN = df.loc[i, 'ClusterName']
+        ok=False
+        list_expression = np.array([])
+        list_genes = []
+
+        for j in range(5):
+            #Get gene name
+            g = df.loc[i, 'MarkerGenes'].split(' ')[j]
+            #Get expression in target cluster
+            clust_exp = ds[np.where(ds.ra['Gene'] == g)[0][0], np.where(ds.ca['ClusterName'] == CN)[0][0]]
+            if g not in bad_genes:
+                list_expression = np.append(list_expression, clust_exp)
+                list_genes.append(g)
+            #Get max expression of all clusters
+            max_exp_all = max_exp_dict[g]
+            #Compare minimal expression with the cluster expression
+            #Compare max expression with max expression for all clusters
+            if min_exp < clust_exp and max_exp_all < max_exp and g not in bad_genes:
+                ok = True
+                break
+
+        if ok == False:
+            #Selection failed, pick gene that best matches the criteria.
+            #Get gene that is closest to the middle of the min and max expression.
+            diff = np.absolute(list_expression - ((max_exp - min_exp) /2))
+            index_best = np.where(diff == diff.min())
+            #Select best gene
+            g = list_genes[index_best[0][0]]
+            print('No marker for {}, best: {} with expression: {}, index {} --> {}'.format(CN, g, ds[np.where(ds.ra['Gene'] == g)[0][0], np.where(ds.ca['ClusterName'] == CN)[0][0]], index_best[0][0], np.round(list_expression,3)))
+
+        #Add to gene set
+        genes[CN] = g
+
+    print('\nNumber of unique selected genes: {}  {} options left'.format(len(set(genes.values())), 168-len(set(genes.values()))))    
+    return genes
diff --git a/gene_selection.py.webloc b/gene_selection.py.webloc