-
Notifications
You must be signed in to change notification settings - Fork 1
/
overlap_bps_with_feature.py
236 lines (212 loc) · 12.7 KB
/
overlap_bps_with_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/usr/bin/env python
import os
import shutil
import sys
import subprocess
import argparse
import time
import random
from multiprocessing import Pool
parser = argparse.ArgumentParser()
parser.add_argument("bp_file", help="BED file of breakpoint regions")
parser.add_argument("bp_name", help="Name of this set of breakpoint regions")
parser.add_argument("feature_file", help="BED file of features to test against")
parser.add_argument("feature_name", help="name of the features being tested against")
parser.add_argument("genome", help="FAI file of the genome")
parser.add_argument("--gaps", help="BED file of gaps in the genome")
parser.add_argument("--permutations", type=int, default=10000, help="iteration number")
parser.add_argument("--cores", type=int, default=100, help="number of worker cores on the cluster to use")
parser.add_argument("--replot", action='store_true', dest="replot", help="only replot a directory, don't redo the permutations")
parser.add_argument("--shift_only", action='store_true', dest="shift_only", help="only do shift analysis")
parser.add_argument("--process_iteration_chunk", help="one of bp_hits|feature_hits|bases_overlapped")
parser.add_argument("--iteration_number", type=int, help="starting iteration number")
parser.add_argument("--iteration_chunk_size", type=int, default=100, help="number of iterations in this chunk")
parser.add_argument("--use_condor", action='store_true', dest="use_condor", help="execute permutation workers on the condor scheduling engine")
parser.add_argument("--verbose", action='store_true', dest="verbose", help="execute permutation workers on the condor scheduling engine")
parser.add_argument("--plot_hist_color", help="color to use in the plot histograms", default="blue")
parser.add_argument("--actual_observation_label", help="label to use for the observed data point on the plot")
args = parser.parse_args()
bp_file = args.bp_file
bp_name = args.bp_name
feature_file = args.feature_file
feature_name = args.feature_name
genome = args.genome
gaps = args.gaps
process_iteration_cmd = args.process_iteration_chunk
iteration_number = args.iteration_number
iteration_chunk_size = args.iteration_chunk_size
permutations = args.permutations
cores = args.cores
replot = args.replot
shift_only = args.shift_only
use_condor = args.use_condor
verbose = args.verbose
plot_hist_color = args.plot_hist_color
actual_obs_label = args.actual_observation_label
script_path = os.path.dirname( os.path.realpath( __file__ ) )
def process_iteration(i, fun, gaps, bp_file, genome, feature_file):
if verbose:
sys.stderr.write("running iteration %s\n" % i)
if gaps != None:
command = ['shuffleBed', '-excl', gaps, '-i', bp_file, '-g', genome, '-chrom']
if verbose:
sys.stderr.write(str(command) + "\n")
shufflep = subprocess.Popen(command, stdout=subprocess.PIPE)
else:
command = ['shuffleBed', '-i', bp_file, '-g', genome, '-chrom']
if verbose:
sys.stderr.write(str(command) + "\n")
shufflep = subprocess.Popen(command, stdout=subprocess.PIPE)
sortp = subprocess.Popen('sort -k1,1 -k2,2n', stdin=shufflep.stdout, stdout=subprocess.PIPE, shell=True)
shuffled = sortp.communicate()[0]
hits = fun(feature_file, '-', input=shuffled)
return (str(i) + "\t" + str(hits))
def wrap_process_iteration(arg_tuple):
myargs = arg_tuple
time.sleep(random.random() * 60)
condor_run_prefix = []
if (bool(myargs[7])):
condor_run_prefix = ['condor_run']
verbose_opt = []
if verbose:
verbose_opt = ['--verbose']
sys.stderr.write("verbose opt: " + str(verbose_opt))
gaps_opts = []
if myargs[2] is not None:
gaps_opts = ["--gaps", str(myargs[2])]
iteration_command = condor_run_prefix + ['python', script_path + '/overlap_bps_with_feature.py', myargs[3], "none", myargs[5], "none", myargs[4], "--process_iteration", myargs[1], "--iteration_number", str(myargs[0]), "--iteration_chunk_size", str(myargs[6])] + gaps_opts + verbose_opt
if verbose:
sys.stderr.write(str(iteration_command) + "\n")
result = subprocess.Popen(iteration_command, stdout=subprocess.PIPE).communicate()[0]
return result
def compute_bases_overlapped(feature_file, bp_file, input=None):
if bp_file == '-':
random_bp_hits = subprocess.Popen(['bedmap', '--bases-uniq', bp_file, feature_file], stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate(input)[0]
else:
random_bp_hits = subprocess.Popen(['bedmap', '--bases-uniq', bp_file, feature_file], stdout=subprocess.PIPE).communicate()[0]
bases_overlapped = 0
for line in random_bp_hits.split("\n"):
if line != "":
bases_overlapped += int(line)
return bases_overlapped
def compute_bp_hits(feature_file, bp_file, input=None):
if bp_file == '-':
bp_hits = subprocess.Popen(['bedops', '--element-of', '-1', bp_file, feature_file], stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate(input)[0]
else:
bp_hits = subprocess.Popen(['bedops', '--element-of', '-1', bp_file, feature_file], stdout=subprocess.PIPE).communicate()[0]
hits = len(bp_hits.split("\n")) - 1
return hits
def compute_feature_hits(feature_file, bp_file, input=None):
if bp_file == '-':
bp_hits = subprocess.Popen(['bedops', '--element-of', '-1', feature_file, bp_file], stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate(input)[0]
else:
bp_hits = subprocess.Popen(['bedops', '--element-of', '-1', feature_file, bp_file], stdout=subprocess.PIPE).communicate()[0]
hits = len(bp_hits.split("\n")) - 1
return hits
if process_iteration_cmd == None:
if not replot and not shift_only:
iterations = permutations
p = Pool(cores)
results_dirname = bp_name.replace(' ', '_') + "_to_" + feature_name.replace(' ', '_')
if (os.path.exists(results_dirname)):
sys.stderr.write("Directory %s already exists; please remove it before re-running\n" % results_dirname)
sys.exit(1)
# compute the real number of bps that overlap a feature
num_real_bp_hits = compute_bp_hits(feature_file, bp_file)
print "num bps that overlap features: {0}".format(num_real_bp_hits)
# compute the real number of features that overlap a bp region
num_real_feature_hits = compute_feature_hits(feature_file, bp_file)
print "num features that overlap bps: {0}".format(num_real_feature_hits)
# compute the real number of bases overlapped
real_bases_overlapped = compute_bases_overlapped(feature_file, bp_file)
print "num bases in bps that overlap features: {0}".format(real_bases_overlapped)
if not replot:
if not shift_only:
# use shuffleBed to permute bp locations
print "Computing permutations for BP hits"
bps_with_hits_file = open('bps_with_hits.txt', 'w')
num_random_bp_hits = p.map(wrap_process_iteration, [(i, "bp_hits", gaps, bp_file, genome, feature_file, iteration_chunk_size, use_condor) for i in xrange(0, iterations, iteration_chunk_size)])
for ip in ("".join(num_random_bp_hits)).rstrip().split("\n"):
fields = ip.split("\t")
bps_with_hits_file.write(str(fields[0]) + "\t" + str(fields[1]) + "\n")
bps_with_hits_file.close()
print "Computing permutations for feature hits"
feature_hits_file = open('feature_hits.txt', 'w')
num_random_feature_hits = p.map(wrap_process_iteration, [(i, "feature_hits", gaps, bp_file, genome, feature_file, iteration_chunk_size, use_condor) for i in xrange(0, iterations, iteration_chunk_size)])
for ip in ("".join(num_random_feature_hits)).rstrip().split("\n"):
fields = ip.split("\t")
feature_hits_file.write(str(fields[0]) + "\t" + str(fields[1]) + "\n")
feature_hits_file.close()
print "Computing permutations for bases overlapped"
bases_overlap_file = open('bases_overlapped.txt', 'w')
num_bases_overlapped = p.map(wrap_process_iteration, [(i, "bases_overlapped", gaps, bp_file, genome, feature_file, iteration_chunk_size, use_condor) for i in xrange(0, iterations, iteration_chunk_size)])
for ip in ("".join(num_bases_overlapped)).rstrip().split("\n"):
fields = ip.split("\t")
bases_overlap_file.write(str(fields[0]) + "\t" + str(fields[1]) + "\n")
bases_overlap_file.close()
print "Computing shifts"
# shift the regions over and compute overlaps
shifted_bp_hits_file = open('shifted_bp_hits.txt', 'w')
shifted_feature_hits_file = open('shifted_feature_hits.txt', 'w')
shifted_bases_overlapped_file = open('shifted_bases_overlapped.txt', 'w')
for i in xrange(-1000000,1000000,25000):
shifted_regions = subprocess.Popen(['slopBed', '-i', bp_file, '-g', genome, '-l', str(i), '-r', str(-1 * i)], stdout=subprocess.PIPE).communicate()[0]
# regions very close to the ends of chromosomes can end up with starts greater than ends; need to filter them out
filtered_shifted_regions = []
for line in shifted_regions.split("\n"):
if line == "":
continue
fields = line.split("\t")
start = int(fields[1])
end = int(fields[2])
if start < end:
filtered_shifted_regions.append(line)
else:
sys.stderr.write("filtering out shifted region: " + line + "\n")
#sys.stderr.write("filtered_shifted_regions: " + "\n".join(filtered_shifted_regions))
#sys.stderr.write("left with {0} shifted regions\n".format(len(filtered_shifted_regions)))
bp_hits = float(compute_bp_hits(feature_file, '-', "\n".join(filtered_shifted_regions))) / len(filtered_shifted_regions)
shifted_bp_hits_file.write(str(i) + "\t" + str(bp_hits) + "\n")
feature_hits = float(compute_feature_hits(feature_file, '-', "\n".join(filtered_shifted_regions))) / len(filtered_shifted_regions)
shifted_feature_hits_file.write(str(i) + "\t" + str(feature_hits) + "\n")
bases_overlapped = float(compute_bases_overlapped(feature_file, '-', "\n".join(filtered_shifted_regions))) / len(filtered_shifted_regions)
shifted_bases_overlapped_file.write(str(i) + "\t" + str(bases_overlapped) + "\n")
shifted_bp_hits_file.close()
shifted_feature_hits_file.close()
shifted_bases_overlapped_file.close()
if not shift_only:
results_dirname = bp_name.replace(' ', '_') + "_to_" + feature_name.replace(' ', '_')
# plot the results
print actual_obs_label
if replot:
plot_args = ['Rscript', script_path + '/plot_results.R', feature_name, bp_name, num_real_bp_hits, num_real_feature_hits, real_bases_overlapped, results_dirname, plot_hist_color]
if actual_obs_label is not None:
plot_args = plot_args + [ actual_obs_label ]
print plot_args
subprocess.call(map(str, plot_args))
else:
plot_args = ['Rscript', script_path + '/plot_results.R', feature_name, bp_name, num_real_bp_hits, num_real_feature_hits, real_bases_overlapped, '.', plot_hist_color]
if actual_obs_label is not None:
plot_args = plot_args + [ actual_obs_label ]
print plot_args
subprocess.call(map(str, plot_args))
os.mkdir(results_dirname)
shutil.move('bases_overlapped.txt', results_dirname + "/")
shutil.move('bps_with_hits.txt', results_dirname + "/")
shutil.move('feature_hits.txt', results_dirname + "/")
shutil.move('shifted_bp_hits.txt', results_dirname + "/")
shutil.move('shifted_feature_hits.txt', results_dirname + "/")
shutil.move('shifted_bases_overlapped.txt', results_dirname + "/")
shutil.move('results.txt', results_dirname + "/")
shutil.move(results_dirname + ".pdf", results_dirname + "/")
elif process_iteration_cmd == "feature_hits":
for i in xrange(iteration_number, iteration_number + iteration_chunk_size):
print process_iteration(i, compute_feature_hits, gaps, bp_file, genome, feature_file)
elif process_iteration_cmd == "bp_hits":
for i in xrange(iteration_number, iteration_number + iteration_chunk_size):
print process_iteration(i, compute_bp_hits, gaps, bp_file, genome, feature_file)
elif process_iteration_cmd == "bases_overlapped":
for i in xrange(iteration_number, iteration_number + iteration_chunk_size):
print process_iteration(i, compute_bases_overlapped, gaps, bp_file, genome, feature_file)
else:
sys.stderr.write("Bad process_iteration commmand!\n")