Skip to content

Commit

Permalink
power estimation optional
Browse files Browse the repository at this point in the history
  • Loading branch information
Jonathan Y. Hsu authored and Jonathan Y. Hsu committed Jan 3, 2019
1 parent af37ad3 commit 57df481
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 47 deletions.
111 changes: 84 additions & 27 deletions SURF/command_line/CRISPR_SURF_Outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def crispr_surf_sgRNA_summary_table_update(sgRNA_summary_table, gammas2betas, av

f.write(','.join(map(str, row)) + '\n')

def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, out_dir):
def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, estimate_statistical_power, out_dir):
"""
Function to output total beta profile.
"""
Expand All @@ -118,18 +118,32 @@ def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, out_dir):
padj_min = np.min([float(x) for x in gammas2betas['padj'] if str(x) != 'nan' and float(x) > 0])
pvals_new = [float(x) if float(x) > 0 else p_min for x in pvals]
pvals_adj_new = [float(x) if float(x) > 0 else padj_min for x in pvals_adj]
power = gammas2betas['power']

if estimate_statistical_power == 'yes':
power = gammas2betas['power']

df = pd.DataFrame({
'Chr': chrom,
'Index': indices,
'Beta': betas,
'Pval.': pvals_new,
'Pval_adj.': pvals_adj_new,
'Statistical_Power': power
})
df = pd.DataFrame({
'Chr': chrom,
'Index': indices,
'Beta': betas,
'Pval.': pvals_new,
'Pval_adj.': pvals_adj_new,
'Statistical_Power': power
})

df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.','Statistical_Power'])
df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.','Statistical_Power'])

else:

df = pd.DataFrame({
'Chr': chrom,
'Index': indices,
'Beta': betas,
'Pval.': pvals_new,
'Pval_adj.': pvals_adj_new
})

df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.'])

def crispr_surf_significant_regions(sgRNA_summary_table, gammas2betas, padj_cutoffs, scale, guideindices2bin, out_dir):

Expand Down Expand Up @@ -201,7 +215,7 @@ def crispr_surf_significant_regions(sgRNA_summary_table, gammas2betas, padj_cuto
if len(associated_sgRNAs) > 0:
f.write(','.join(map(str, [padj_cutoff, chrom, genomic_boundary_start, genomic_boundary_stop, significance_direction, signal_area, signal_mean, padj_mean, len(associated_sgRNAs), ','.join(map(str, associated_sgRNAs))])) + '\n')

def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, scale, guideindices2bin, out_dir):
def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, scale, guideindices2bin, estimate_statistical_power, out_dir):

"""
Function to output tracks to load on IGV.
Expand Down Expand Up @@ -265,27 +279,70 @@ def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, sca
# Output raw and deconvolved scores IGV track
dff = df_summary_table[pd.notnull(df_summary_table['Chr']) & pd.notnull(df_summary_table['Perturbation_Index']) & pd.notnull(df_summary_table['Raw_Signal_Combined']) & pd.notnull(df_summary_table['Deconvolved_Signal_Combined'])]

with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:
# with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:

for index, row in dff.iterrows():
# for index, row in dff.iterrows():

for r in range(1, replicates + 1):
raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
# for r in range(1, replicates + 1):
# raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')

for index in range(len(gammas2betas['indices'])):
# for index in range(len(gammas2betas['indices'])):

if float(gammas2betas['padj'][index]) > 0:
neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
else:
neglog10_pval = -math.log10(padj_min)
# if float(gammas2betas['padj'][index]) > 0:
# neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
# else:
# neglog10_pval = -math.log10(padj_min)

# deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
# neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
# statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')

if estimate_statistical_power == 'yes':

deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')
with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:

for index, row in dff.iterrows():

for r in range(1, replicates + 1):
raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')

for index in range(len(gammas2betas['indices'])):

if float(gammas2betas['padj'][index]) > 0:
neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
else:
neglog10_pval = -math.log10(padj_min)

# Create IGV session
with open('/SURF/igv_session_template.xml', 'r') as f:
igv_template = f.read()
deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')

# Create IGV session
with open('/SURF/igv_session_template.xml', 'r') as f:
igv_template = f.read()

else:

with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals:

for index, row in dff.iterrows():

for r in range(1, replicates + 1):
raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')

for index in range(len(gammas2betas['indices'])):

if float(gammas2betas['padj'][index]) > 0:
neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
else:
neglog10_pval = -math.log10(padj_min)

deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')

# Create IGV session
with open('/SURF/igv_session_template_nopower.xml', 'r') as f:
igv_template = f.read()

igv_template = igv_template.replace('#genome#', str(genome))

Expand Down
35 changes: 18 additions & 17 deletions SURF/command_line/CRISPR_SURF_Statistical_Significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def crispr_surf_deconvolved_signal(gammas2betas, gamma_chosen, averaging_method,

return gammas2betas

def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, perturbation_profile, gammas2betas, null_distribution, simulation_n, test_type, guideindices2bin, averaging_method, padj_cutoffs, effect_size, limit, scale):
def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, perturbation_profile, gammas2betas, null_distribution, simulation_n, test_type, guideindices2bin, averaging_method, padj_cutoffs, effect_size, limit, scale, estimate_statistical_power):

"""
Function to assess the statistical significance of deconvolved genomic signal.
Expand Down Expand Up @@ -201,29 +201,30 @@ def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, per
new_p_cutoff = beta_pvals[pymin(range(len(beta_pvals_adj)), key=lambda i: pyabs(beta_pvals_adj[i] - float(padj_cutoffs[0])))]

# Estimate statistical power
beta_statistical_power = []
if scale > 1:
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = guideindices2bin.keys(), gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
if estimate_statistical_power == 'yes':
beta_statistical_power = []
if scale > 1:
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = guideindices2bin.keys(), gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)

else:
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = sgRNA_indices, gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
else:
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = sgRNA_indices, gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)

for i in range(len(beta_corrected_effect_size)):
for i in range(len(beta_corrected_effect_size)):

# shifted_distribution = [x + beta_corrected_effect_size[i] for x in beta_distributions_null[i]]
# percentile_cutoff = np.percentile(beta_distributions_null[i], (100.0 - float(new_p_cutoff)*100.0/2.0))
# shifted_distribution = [x + beta_corrected_effect_size[i] for x in beta_distributions_null[i]]
# percentile_cutoff = np.percentile(beta_distributions_null[i], (100.0 - float(new_p_cutoff)*100.0/2.0))

beta_dist_null = np.array(beta_distributions_null[i])
shifted_distribution = beta_dist_null + beta_corrected_effect_size[i]
percentile_cutoff = np.percentile(beta_dist_null, (100.0 - float(new_p_cutoff)*100.0/2.0))
beta_dist_null = np.array(beta_distributions_null[i])
shifted_distribution = beta_dist_null + beta_corrected_effect_size[i]
percentile_cutoff = np.percentile(beta_dist_null, (100.0 - float(new_p_cutoff)*100.0/2.0))

if (i + 1)%500 == 0:
logger.info('Calculated statistical power for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
if (i + 1)%500 == 0:
logger.info('Calculated statistical power for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))

# beta_statistical_power.append(float(sum(x >= percentile_cutoff for x in shifted_distribution))/float(len(shifted_distribution)))
# beta_statistical_power.append(float(sum(x >= percentile_cutoff for x in shifted_distribution))/float(len(shifted_distribution)))

beta_statistical_power.append((shifted_distribution > percentile_cutoff).sum() / float(len(shifted_distribution)))
beta_statistical_power.append((shifted_distribution > percentile_cutoff).sum() / float(len(shifted_distribution)))

gammas2betas['power'] = beta_statistical_power
gammas2betas['power'] = beta_statistical_power

return gammas2betas, replicate_parameters
9 changes: 6 additions & 3 deletions SURF/command_line/SURF_deconvolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
parser.add_argument('-corr', '--correlation', type = float, default = 0, help = 'The correlation between biological replicates to determine a reasonable lambda for the deconvolution operation. if 0 (default), the --characteristic_perturbation_range argument will be used to set an appropriate correlation.')
parser.add_argument('-genome', '--genome', type = str, default = 'hg19', help = 'The genome to be used to create the IGV session file (hg19, hg38, mm9, mm10, etc.).')
parser.add_argument('-effect_size', '--effect_size', type = float, default = 1, help = 'Effect size to estimate statistical power.')
parser.add_argument('-estimate_power', '--estimate_statistical_power', type = str, choices = ['yes', 'no'], default = 'no', help = 'Whether or not to compute a track estimating statistical power for the CRISPR tiling screen data.')
parser.add_argument('-padjs', '--padj_cutoffs', default = 0, nargs = '+', help = 'List of p-adj. (Benjamini-Hochberg) cut-offs for determining significance of regulatory regions in the CRISPR tiling screen.')
parser.add_argument('-out_dir', '--out_directory', type = str, default = 'CRISPR_SURF_Analysis_%s' % (timestr), help = 'The name of the output directory to place CRISPR-SURF analysis files.')
args = parser.parse_args()
Expand All @@ -56,6 +57,7 @@
genome = args.genome
padj_cutoffs = args.padj_cutoffs
effect_size = args.effect_size
estimate_statistical_power = args.estimate_statistical_power
out_dir = args.out_directory

##### Create output directory
Expand Down Expand Up @@ -260,7 +262,7 @@

##### Bootstrap deconvolution analysis to assign statistical significance
try:
gammas2betas_updated, replicate_parameters = crispr_surf_statistical_significance(sgRNA_summary_table = sgRNAs_summary_table, sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gammas2betas = gammas2betas_updated, null_distribution = null_distribution, simulation_n = simulation_n, test_type = test_type, guideindices2bin = guideindices2bin, averaging_method = averaging_method, padj_cutoffs = padj_cutoffs, effect_size = effect_size, limit = limit, scale = scale)
gammas2betas_updated, replicate_parameters = crispr_surf_statistical_significance(sgRNA_summary_table = sgRNAs_summary_table, sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gammas2betas = gammas2betas_updated, null_distribution = null_distribution, simulation_n = simulation_n, test_type = test_type, guideindices2bin = guideindices2bin, averaging_method = averaging_method, padj_cutoffs = padj_cutoffs, effect_size = effect_size, limit = limit, scale = scale, estimate_statistical_power = estimate_statistical_power)
logger.info('Finished simulations to assess statistical significance of deconvolution profile ...')

except:
Expand All @@ -278,7 +280,7 @@

##### Output beta profile
try:
complete_beta_profile(gammas2betas = gammas2betas_updated, simulation_n = simulation_n, padj_cutoffs = padj_cutoffs, out_dir = out_dir)
complete_beta_profile(gammas2betas = gammas2betas_updated, simulation_n = simulation_n, padj_cutoffs = padj_cutoffs, estimate_statistical_power = estimate_statistical_power, out_dir = out_dir)
logger.info('Successfully created beta profile file ...')

except:
Expand All @@ -296,7 +298,7 @@

##### Output IGV tracks
try:
crispr_surf_IGV(sgRNA_summary_table = sgRNAs_summary_table.split('/')[-1].replace('.csv', '_updated.csv'), gammas2betas = gammas2betas_updated, padj_cutoffs = padj_cutoffs, genome = genome, scale = scale, guideindices2bin = guideindices2bin, out_dir = out_dir)
crispr_surf_IGV(sgRNA_summary_table = sgRNAs_summary_table.split('/')[-1].replace('.csv', '_updated.csv'), gammas2betas = gammas2betas_updated, padj_cutoffs = padj_cutoffs, genome = genome, scale = scale, guideindices2bin = guideindices2bin, estimate_statistical_power = estimate_statistical_power, out_dir = out_dir)
logger.info('Successfully created IGV tracks ...')

except:
Expand All @@ -321,6 +323,7 @@
'correlation': correlation,
'genome': genome,
'effect_size': effect_size,
'estimate_statistical_power': estimate_statistical_power,
'padj_cutoffs': ' '.join(map(str, padj_cutoffs)),
'out_directory': out_dir
}
Expand Down
Loading

0 comments on commit 57df481

Please sign in to comment.