-
Notifications
You must be signed in to change notification settings - Fork 9
/
liftover_intropolis.py
162 lines (153 loc) · 6.9 KB
/
liftover_intropolis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python
"""
liftover_intropolis.py
Lifts intropolis over to hg38 and writes output including both hg19 coordinates
and hg38 coordinates.
Requires
http://hgdownload.cse.ucsc.edu/goldenpath/mm10/
liftOver/hg19ToHg38.over.chain.gz
liftOver executable available from
https://genome-store.ucsc.edu/products/
intropolis.v1.hg19.tsv.gz .
Writes to stdout. We ran
pypy liftover_intropolis.py
--liftover /path/to/liftOver
--chain /path/to/hg19ToHg38.over.chain
--intropolis /path/to/intropolis.v1.hg19.tsv.gz
| gzip >intropolis.v1.hg19_with_liftover_to_hg38.tsv.gz
Tab-separated output fields
1. hg19 chrom
2. hg19 start (1-based, inclusive)
3. hg19 end (1-based, inclusive)
4. hg19 strand
5. left motif (e.g., GT)
6. right motif (e.g., AG)
7. comma-separated list of indexes of samples from
intropolis in which junction was found
8. comma-separated list of numbers of reads in corresponding samples from
field 7 overlapping junction
9. hg38 chrom or NA if liftover unsuccessful
10. hg38 start or NA
11. hg38 end or NA
12. hg38 strand or NA
"""
import tempfile
import gzip
import shutil
import atexit
import subprocess
import os
if __name__ == '__main__':
import argparse
# Print file's docstring if -h is invoked
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('--liftover', type=str, required=True,
help=('path to liftOver executable available from '
'https://genome-store.ucsc.edu/products/')
)
parser.add_argument('--chain', type=str, required=True,
help=('path to unzipped liftover chain; this should be '
'hg19ToHg38.over.chain')
)
parser.add_argument('--intropolis', type=str, required=True,
help='path to intropolis.v1.hg19.tsv.gz'
)
parser.add_argument('--temp-dir', type=str, required=False,
default=None,
help='where to store temporary files; defaults to TMPDIR'
)
args = parser.parse_args()
temp_dir = tempfile.mkdtemp(dir=args.temp_dir)
to_liftover = os.path.join(temp_dir, 'to_liftover.bed')
temp_hg38 = os.path.join(temp_dir, 'hg38.bed')
temp_hg19 = os.path.join(temp_dir, 'hg19.bed')
with open(temp_hg19, 'w') as hg19_stream, gzip.open(
args.intropolis
) as input_stream:
for i, line in enumerate(input_stream):
tokens = line.strip().split('\t')
chrom, start, end, strand = (
tokens[0], str(int(tokens[1]) - 1),
tokens[2], tokens[3],
) # zero-based, half-open coordinates for BED
# Tack original junction onto junction name
junction_name = ';'.join([str(i), chrom, start, end, strand])
print >>hg19_stream, '{}\t{}\t{}\tinfo_{}\t1\t{}'.format(
chrom, start, end, junction_name, strand
)
# Convert junctions from hg19 to hg38
liftover_process = subprocess.call(' '.join([
args.liftover,
'-ends=2',
'-minMatch=1.0',
temp_hg19,
args.chain,
temp_hg38,
os.path.join(
temp_dir,
'unmapped.bed'
)
]),
shell=True,
executable='/bin/bash'
)
to_sort = os.path.join(temp_dir, 'intropolis_and_liftover.tsv.gz')
with gzip.open(to_sort, 'w') as both_stream:
with open(temp_hg38) as hg38_stream:
for line in hg38_stream:
chrom, start, end, name, score, strand = line.strip().split(
'\t'
)[:6]
(_, hg19_chrom, hg19_start,
hg19_end, hg19_strand) = name.split(';')
hg19_start, start = int(hg19_start), int(start)
print >>both_stream, '\t'.join(
[hg19_chrom, str(hg19_start + 1), hg19_end,
hg19_strand, chrom, str(start + 1),
end, strand, 'FAKE']
)
with gzip.open(args.intropolis) as intropolis_stream:
for line in intropolis_stream:
print >>both_stream, line,
sorted_together = os.path.join(temp_dir, 'sorted_together.tsv.gz')
subprocess.check_call(
'gzip -cd {} | sort -k1,1 -k2,2n -k3,3n | gzip >{}'.format(
to_sort, sorted_together
), shell=True, bufsize=-1
)
with gzip.open(sorted_together) as sorted_stream:
junction_1_tokens = sorted_stream.readline().strip().split('\t')
junction_2_tokens = sorted_stream.readline().strip().split('\t')
while True:
if junction_1_tokens[:4] == junction_2_tokens[:4]:
# Liftover available
if len(junction_1_tokens) > len(junction_2_tokens):
hg38_tokens = junction_1_tokens
hg19_tokens = junction_2_tokens
else:
hg38_tokens = junction_2_tokens
hg19_tokens = junction_1_tokens
print '\t'.join(hg19_tokens + hg38_tokens[4:8])
junction_1_tokens = sorted_stream.readline().strip()
if not junction_1_tokens:
# End of file; nothing to print
break
junction_1_tokens = junction_1_tokens.split('\t')
junction_2_tokens = sorted_stream.readline().strip()
if not junction_2_tokens:
# End of file; print junction 1 tokens and sign out
print '\t'.join(junction_1_tokens + ['NA'] * 4)
break
junction_2_tokens = junction_2_tokens.split('\t')
else:
'''Liftover not available for junction 1, but have to check
junction 2 against next junction.'''
print '\t'.join(junction_1_tokens + ['NA'] * 4)
junction_1_tokens = junction_2_tokens
junction_2_tokens = sorted_stream.readline().strip()
if not junction_2_tokens:
# End of file; print new junction 1 tokens and sign out
print '\t'.join(junction_1_tokens + ['NA'] * 4)
break
junction_2_tokens = junction_2_tokens.split('\t')