power estimation optional

pinellolab · Jan 3, 2019 · 57df481 · 57df481
1 parent af37ad3
commit 57df481
Show file tree

Hide file tree

Showing 5 changed files with 186 additions and 47 deletions.
diff --git a/SURF/command_line/CRISPR_SURF_Outputs.py b/SURF/command_line/CRISPR_SURF_Outputs.py
@@ -104,7 +104,7 @@ def crispr_surf_sgRNA_summary_table_update(sgRNA_summary_table, gammas2betas, av
 
 			f.write(','.join(map(str, row)) + '\n')
 
-def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, out_dir):
+def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, estimate_statistical_power, out_dir):
 	"""
 	Function to output total beta profile.
 	"""
@@ -118,18 +118,32 @@ def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, out_dir):
 	padj_min = np.min([float(x) for x in gammas2betas['padj'] if str(x) != 'nan' and float(x) > 0])
 	pvals_new = [float(x) if float(x) > 0 else p_min for x in pvals]
 	pvals_adj_new = [float(x) if float(x) > 0 else padj_min for x in pvals_adj]
-	power = gammas2betas['power']
+
+	if estimate_statistical_power == 'yes':
+		power = gammas2betas['power']
 
-	df = pd.DataFrame({
-		'Chr': chrom,
-		'Index': indices,
-		'Beta': betas,
-		'Pval.': pvals_new,
-		'Pval_adj.': pvals_adj_new,
-		'Statistical_Power': power
-		})
+		df = pd.DataFrame({
+			'Chr': chrom,
+			'Index': indices,
+			'Beta': betas,
+			'Pval.': pvals_new,
+			'Pval_adj.': pvals_adj_new,
+			'Statistical_Power': power
+			})
 
-	df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.','Statistical_Power'])
+		df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.','Statistical_Power'])
+
+	else:
+
+		df = pd.DataFrame({
+			'Chr': chrom,
+			'Index': indices,
+			'Beta': betas,
+			'Pval.': pvals_new,
+			'Pval_adj.': pvals_adj_new
+			})
+
+		df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.'])
 
 def crispr_surf_significant_regions(sgRNA_summary_table, gammas2betas, padj_cutoffs, scale, guideindices2bin, out_dir):
 
@@ -201,7 +215,7 @@ def crispr_surf_significant_regions(sgRNA_summary_table, gammas2betas, padj_cuto
 				if len(associated_sgRNAs) > 0:
 					f.write(','.join(map(str, [padj_cutoff, chrom, genomic_boundary_start, genomic_boundary_stop, significance_direction, signal_area, signal_mean, padj_mean, len(associated_sgRNAs), ','.join(map(str, associated_sgRNAs))])) + '\n')
 
-def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, scale, guideindices2bin, out_dir):
+def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, scale, guideindices2bin, estimate_statistical_power, out_dir):
 
 	"""
 	Function to output tracks to load on IGV.
@@ -265,27 +279,70 @@ def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, sca
 	# Output raw and deconvolved scores IGV track
 	dff = df_summary_table[pd.notnull(df_summary_table['Chr']) & pd.notnull(df_summary_table['Perturbation_Index']) & pd.notnull(df_summary_table['Raw_Signal_Combined']) & pd.notnull(df_summary_table['Deconvolved_Signal_Combined'])]
 
-	with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:
+	# with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:
 
-		for index, row in dff.iterrows():
+	# 	for index, row in dff.iterrows():
 
-			for r in range(1, replicates + 1):
-				raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
+	# 		for r in range(1, replicates + 1):
+	# 			raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
 
-		for index in range(len(gammas2betas['indices'])):
+	# 	for index in range(len(gammas2betas['indices'])):
 
-			if float(gammas2betas['padj'][index]) > 0:
-				neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
-			else:
-				neglog10_pval = -math.log10(padj_min)
+	# 		if float(gammas2betas['padj'][index]) > 0:
+	# 			neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
+	# 		else:
+	# 			neglog10_pval = -math.log10(padj_min)
+
+	# 		deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
+	# 		neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
+	# 		statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')
+
+	if estimate_statistical_power == 'yes':
 
-			deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
-			neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
-			statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')
+		with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:
+
+			for index, row in dff.iterrows():
+
+				for r in range(1, replicates + 1):
+					raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
+
+			for index in range(len(gammas2betas['indices'])):
+
+				if float(gammas2betas['padj'][index]) > 0:
+					neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
+				else:
+					neglog10_pval = -math.log10(padj_min)
 
-	# Create IGV session
-	with open('/SURF/igv_session_template.xml', 'r') as f:
-		igv_template = f.read()
+				deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
+				neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
+				statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')
+
+		# Create IGV session
+		with open('/SURF/igv_session_template.xml', 'r') as f:
+			igv_template = f.read()
+
+	else:
+
+		with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals:
+
+			for index, row in dff.iterrows():
+
+				for r in range(1, replicates + 1):
+					raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
+
+			for index in range(len(gammas2betas['indices'])):
+
+				if float(gammas2betas['padj'][index]) > 0:
+					neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
+				else:
+					neglog10_pval = -math.log10(padj_min)
+
+				deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
+				neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
+
+		# Create IGV session
+		with open('/SURF/igv_session_template_nopower.xml', 'r') as f:
+			igv_template = f.read()
 
 	igv_template = igv_template.replace('#genome#', str(genome))
 

diff --git a/SURF/command_line/CRISPR_SURF_Statistical_Significance.py b/SURF/command_line/CRISPR_SURF_Statistical_Significance.py
@@ -60,7 +60,7 @@ def crispr_surf_deconvolved_signal(gammas2betas, gamma_chosen, averaging_method,
 
 	return gammas2betas
 
-def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, perturbation_profile, gammas2betas, null_distribution, simulation_n, test_type, guideindices2bin, averaging_method, padj_cutoffs, effect_size, limit, scale):
+def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, perturbation_profile, gammas2betas, null_distribution, simulation_n, test_type, guideindices2bin, averaging_method, padj_cutoffs, effect_size, limit, scale, estimate_statistical_power):
 
 	"""
 	Function to assess the statistical significance of deconvolved genomic signal.
@@ -201,29 +201,30 @@ def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, per
 	new_p_cutoff = beta_pvals[pymin(range(len(beta_pvals_adj)), key=lambda i: pyabs(beta_pvals_adj[i] - float(padj_cutoffs[0])))]
 
 	# Estimate statistical power
-	beta_statistical_power = []
-	if scale > 1:
-		beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = guideindices2bin.keys(), gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
+	if estimate_statistical_power == 'yes':
+		beta_statistical_power = []
+		if scale > 1:
+			beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = guideindices2bin.keys(), gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
 
-	else:
-		beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = sgRNA_indices, gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
+		else:
+			beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = sgRNA_indices, gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
 
-	for i in range(len(beta_corrected_effect_size)):
+		for i in range(len(beta_corrected_effect_size)):
 
-		# shifted_distribution = [x + beta_corrected_effect_size[i] for x in beta_distributions_null[i]]
-		# percentile_cutoff = np.percentile(beta_distributions_null[i], (100.0 - float(new_p_cutoff)*100.0/2.0))
+			# shifted_distribution = [x + beta_corrected_effect_size[i] for x in beta_distributions_null[i]]
+			# percentile_cutoff = np.percentile(beta_distributions_null[i], (100.0 - float(new_p_cutoff)*100.0/2.0))
 
-		beta_dist_null = np.array(beta_distributions_null[i])
-		shifted_distribution = beta_dist_null + beta_corrected_effect_size[i]
-		percentile_cutoff = np.percentile(beta_dist_null, (100.0 - float(new_p_cutoff)*100.0/2.0))
+			beta_dist_null = np.array(beta_distributions_null[i])
+			shifted_distribution = beta_dist_null + beta_corrected_effect_size[i]
+			percentile_cutoff = np.percentile(beta_dist_null, (100.0 - float(new_p_cutoff)*100.0/2.0))
 
-		if (i + 1)%500 == 0:
-			logger.info('Calculated statistical power for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
+			if (i + 1)%500 == 0:
+				logger.info('Calculated statistical power for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
 
-		# beta_statistical_power.append(float(sum(x >= percentile_cutoff for x in shifted_distribution))/float(len(shifted_distribution)))
+			# beta_statistical_power.append(float(sum(x >= percentile_cutoff for x in shifted_distribution))/float(len(shifted_distribution)))
 
-		beta_statistical_power.append((shifted_distribution > percentile_cutoff).sum() / float(len(shifted_distribution)))
+			beta_statistical_power.append((shifted_distribution > percentile_cutoff).sum() / float(len(shifted_distribution)))
 
-	gammas2betas['power'] = beta_statistical_power
+		gammas2betas['power'] = beta_statistical_power
 
 	return gammas2betas, replicate_parameters
diff --git a/SURF/command_line/SURF_deconvolution.py b/SURF/command_line/SURF_deconvolution.py
@@ -36,6 +36,7 @@
 parser.add_argument('-corr', '--correlation', type = float, default = 0, help = 'The correlation between biological replicates to determine a reasonable lambda for the deconvolution operation. if 0 (default), the --characteristic_perturbation_range argument will be used to set an appropriate correlation.')
 parser.add_argument('-genome', '--genome', type = str, default = 'hg19', help = 'The genome to be used to create the IGV session file (hg19, hg38, mm9, mm10, etc.).')
 parser.add_argument('-effect_size', '--effect_size', type = float, default = 1, help = 'Effect size to estimate statistical power.')
+parser.add_argument('-estimate_power', '--estimate_statistical_power', type = str, choices = ['yes', 'no'], default = 'no', help = 'Whether or not to compute a track estimating statistical power for the CRISPR tiling screen data.')
 parser.add_argument('-padjs', '--padj_cutoffs', default = 0, nargs = '+', help = 'List of p-adj. (Benjamini-Hochberg) cut-offs for determining significance of regulatory regions in the CRISPR tiling screen.')
 parser.add_argument('-out_dir', '--out_directory', type = str, default = 'CRISPR_SURF_Analysis_%s' % (timestr), help = 'The name of the output directory to place CRISPR-SURF analysis files.')
 args = parser.parse_args()
@@ -56,6 +57,7 @@
 genome = args.genome
 padj_cutoffs = args.padj_cutoffs
 effect_size = args.effect_size
+estimate_statistical_power = args.estimate_statistical_power
 out_dir = args.out_directory
 
 ##### Create output directory
@@ -260,7 +262,7 @@
 
 ##### Bootstrap deconvolution analysis to assign statistical significance
 try:
-	gammas2betas_updated, replicate_parameters = crispr_surf_statistical_significance(sgRNA_summary_table = sgRNAs_summary_table, sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gammas2betas = gammas2betas_updated, null_distribution = null_distribution, simulation_n = simulation_n, test_type = test_type, guideindices2bin = guideindices2bin, averaging_method = averaging_method, padj_cutoffs = padj_cutoffs, effect_size = effect_size, limit = limit, scale = scale)
+	gammas2betas_updated, replicate_parameters = crispr_surf_statistical_significance(sgRNA_summary_table = sgRNAs_summary_table, sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gammas2betas = gammas2betas_updated, null_distribution = null_distribution, simulation_n = simulation_n, test_type = test_type, guideindices2bin = guideindices2bin, averaging_method = averaging_method, padj_cutoffs = padj_cutoffs, effect_size = effect_size, limit = limit, scale = scale, estimate_statistical_power = estimate_statistical_power)
 	logger.info('Finished simulations to assess statistical significance of deconvolution profile ...')
 
 except:
@@ -278,7 +280,7 @@
 
 ##### Output beta profile
 try:
-	complete_beta_profile(gammas2betas = gammas2betas_updated, simulation_n = simulation_n, padj_cutoffs = padj_cutoffs, out_dir = out_dir)
+	complete_beta_profile(gammas2betas = gammas2betas_updated, simulation_n = simulation_n, padj_cutoffs = padj_cutoffs, estimate_statistical_power = estimate_statistical_power, out_dir = out_dir)
 	logger.info('Successfully created beta profile file ...')
 
 except:
@@ -296,7 +298,7 @@
 
 ##### Output IGV tracks
 try:
-	crispr_surf_IGV(sgRNA_summary_table = sgRNAs_summary_table.split('/')[-1].replace('.csv', '_updated.csv'), gammas2betas = gammas2betas_updated, padj_cutoffs = padj_cutoffs, genome = genome, scale = scale, guideindices2bin = guideindices2bin, out_dir = out_dir)
+	crispr_surf_IGV(sgRNA_summary_table = sgRNAs_summary_table.split('/')[-1].replace('.csv', '_updated.csv'), gammas2betas = gammas2betas_updated, padj_cutoffs = padj_cutoffs, genome = genome, scale = scale, guideindices2bin = guideindices2bin, estimate_statistical_power = estimate_statistical_power, out_dir = out_dir)
 	logger.info('Successfully created IGV tracks ...')
 
 except:
@@ -321,6 +323,7 @@
 'correlation': correlation,
 'genome': genome,
 'effect_size': effect_size,
+'estimate_statistical_power': estimate_statistical_power,
 'padj_cutoffs': ' '.join(map(str, padj_cutoffs)),
 'out_directory': out_dir
 }